Field sampling design - IMOS GBR-MGD initiative

Seawater samples were collected for (1) microbial metagenomics and (2) physico-chemical data (temperature, salinity, and particulate/dissolved nutrient concentrations) from 48 offshore reefs across the length of the GBR, within the Great Barrier Reef Microbial Genomics Database (GBR-MGD) initiative by Australia’s Integrated Marine Observing System (IMOS). This sampling was done alongside ongoing in situ health surveys by the Australian Institute of Marine Science Long-Term Monitoring Program (AIMS-LTMP).

The code below was used to create a map showing the 48 IMOS GBR-MGD sites, by combining these two tutorials: 1. https://open-aims.github.io/gisaimsr/articles/examples.html 2. https://r-spatial.org/r/2018/10/25/ggplot2-sf-2.html

# Read the site coordinates and turn them into an sf point layer in one step.
# `remove = FALSE` keeps the lon/lat columns next to the new geometry column;
# the close-up maps further down use them as x/y aesthetics for the labels.
map_coords <- st_as_sf(
  read.csv("/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Data_analysis/Seawater/testing_Tom_Jenkins_script/all_IMOS-MGD_seawater_subset/Metadata_files/MARKO_for_eReefs_Lats_Longs.csv"),
  coords = c("lon", "lat"),
  remove = FALSE,
  crs = 4283, # EPSG code of GDA94, the CRS used by the dataaimsr & gisaimsr R packages
  agr = "constant"
)
# View(map_coords)

# Attach the sampling trip to every site - the maps below colour sites by trip.

# Final metadata file (environmental measurements averaged per reef site).
# Only the reef name and its sampling trip are needed here. The trip labels
# are rewritten so that they carry the sampling dates and sort alphabetically
# in chronological order. The substitutions run one after another, top down.
map_reef_names_and_trip <- read.csv(file = "/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Code/input_files/metadata_with_reef_names_maps.csv") %>%
  dplyr::select(REEF_NAME, Sampling_trip) %>%
  dplyr::mutate(
    Sampling_trip = as.character(Sampling_trip),
    Sampling_trip = gsub("First", "Trip_01_Nov-Dec_2019", Sampling_trip),   # First trip
    Sampling_trip = gsub("Second", "Trip_02_January_2020", Sampling_trip),  # Second trip
    Sampling_trip = gsub("Third", "Trip_03_February_2020", Sampling_trip),  # Third trip
    Sampling_trip = gsub("Fourth", "Trip_04_July_2020", Sampling_trip)      # Fourth trip
  )

# Merge the trip label into the coordinates table
# (the reef name is called `name` on the left and `REEF_NAME` on the right)
map_coords <- left_join(
  map_coords,
  map_reef_names_and_trip,
  by = c("name" = "REEF_NAME")
)

#########################
# Plot set-up: palettes, base layer and town labels

# Fill colours for the water-body types, one per category in this order
cols_map <- c(
  "tomato3",        # Enclosed Coastal
  "salmon3",        # Macro Tidal Enclosed Coastal
  "pink3",          # Macro Tidal Open Coastal
  "peachpuff",      # Midshelf
  "lightsteelblue", # Offshore
  "lightcoral"      # Open coastal
)

# GBR features without the mainland polygon - plotting it only wastes space
gbr_no_mainland <- dplyr::filter(gbr_feat, FEAT_NAME != "Mainland")

# ------------------------------------------------ #
# Overlaying IMOS-MGD sites - only as points first #
# ------------------------------------------------ #

# One colour per sampling trip, named by trip label
colors <- c(
  "Trip_01_Nov-Dec_2019"  = "indianred",  # Sampling trip 1
  "Trip_02_January_2020"  = "indianred4", # Sampling trip 2
  "Trip_03_February_2020" = "red3",       # Sampling trip 3
  "Trip_04_July_2020"     = "slateblue"   # Sampling trip 4
)

# Sampling trip as a factor; the levels follow the order of the palette above
col.per.trip <- factor(map_coords$Sampling_trip, levels = names(colors))

# City coordinates used for the town labels
oz_cities <- read.csv("/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Code/input_files/oz_cities.csv")

# Overview map: every site drawn as a point coloured by sampling trip,
# with town names added for orientation.
#
# Optional layers that are currently switched off (kept for reference):
#   geom_sf(data = gbr_bounds, fill = "darkred", colour = NA)             # boundaries of the GBR MP
#   geom_sf(data = nrm_regions, mapping = aes(fill = NAME), lwd = 0.01)   # region info ...
#   scale_fill_brewer(name = "Region", palette = "Set3")                  # ... and its palette
#   geom_sf(data = wbodies, mapping = aes(fill = MarineWate), lwd = 0.01) # water bodies in colour
#   scale_fill_manual(name = "Type of Water Body", values = cols_map)
IMOS_MGD_dots_trip <- ggplot(data = gbr_feat) + # gbr_feat comes from the AIMS GIS packages
  # Base layer: GBR features with a flat fill and no outline
  geom_sf(data = gbr_feat, lwd = 0.01, fill = "seashell2", colour = NA) +
  # Our sites as dots, coloured per sampling trip (needs the 'geometry' column)
  geom_sf(data = map_coords, aes(color = Sampling_trip), show.legend = "point") +
  # Town names; repel keeps the labels from overlapping
  geom_text_repel(
    data = oz_cities,
    aes(x = Longitude, y = Latitude, label = Town),
    fontface = "bold",
    size = 3.2,
    col = "black",
    # One nudge per label, given in the row order of oz_cities
    nudge_x = c(
      -0.5, # Townsville
      -0.9, # Brisbane
      -0.5, # Cairns
      -0.8, # Cooktown
      -0.5, # Mackay
      -0.7  # Bundaberg
    ),
    nudge_y = c(
      -0.5, # Townsville
      0.5,  # Brisbane
      -0.5, # Cairns
      -0.5, # Cooktown
      -0.5, # Mackay
      -0.5  # Bundaberg
    )
  ) +
  coord_sf(xlim = c(142, 154), ylim = c(-10, -27)) +
  scale_color_manual(name = "Dates of Sampling Transects", values = colors) +
  labs(
    x = "Longitude",
    y = "Latitude",
    title = "IMOS-MGD",
    subtitle = "Microbial Genomics Database sites"
  ) +
  theme_classic() +
  theme(
    panel.background = element_rect(
      fill = "lightblue3",
      colour = "lightblue3",
      size = 0.5,
      linetype = "solid"
    ),
    legend.direction = "vertical",
    legend.box = "vertical"
  )
IMOS_MGD_dots_trip
Field sampling design for the GBR-MGD (Great Barrier Reef Microbial Genomics Database) dataset. (above) Seawater was collected from 48 offshore GBR reef sites for microbial community metagenomic sequencing and water chemistry analysis over four trips between November 2019 and July 2020. Reef sites are coloured in red or blue tones to denote trips that occurred during the austral summer (wet season) or austral winter (dry season), respectively. (below) A more detailed map showing the name of each reef site, and their membership in either offshore (41 reefs) or mid-shelf (7 reefs) waters. No inshore sites were sampled.

Field sampling design for the GBR-MGD (Great Barrier Reef Microbial Genomics Database) dataset. (above) Seawater was collected from 48 offshore GBR reef sites for microbial community metagenomic sequencing and water chemistry analysis over four trips between November 2019 and July 2020. Reef sites are coloured in red or blue tones to denote trips that occurred during the austral summer (wet season) or austral winter (dry season), respectively. (below) A more detailed map showing the name of each reef site, and their membership in either offshore (41 reefs) or mid-shelf (7 reefs) waters. No inshore sites were sampled.

# Overview map, variant with the GBR features filled in black:
# every site drawn as a point coloured by sampling trip, plus town names.
#
# Optional layers that are currently switched off (kept for reference):
#   geom_sf(data = gbr_bounds, fill = "darkred", colour = NA)             # boundaries of the GBR MP
#   geom_sf(data = nrm_regions, mapping = aes(fill = NAME), lwd = 0.01)   # region info ...
#   scale_fill_brewer(name = "Region", palette = "Set3")                  # ... and its palette
#   geom_sf(data = wbodies, mapping = aes(fill = MarineWate), lwd = 0.01) # water bodies in colour
#   scale_fill_manual(name = "Type of Water Body", values = cols_map)
IMOS_MGD_dots_trip_black <- ggplot(data = gbr_feat) + # gbr_feat comes from the AIMS GIS packages
  # Base layer: GBR features with a black fill and no outline
  geom_sf(data = gbr_feat, lwd = 0.01, fill = "black", colour = NA) +
  # Our sites as dots, coloured per sampling trip (needs the 'geometry' column)
  geom_sf(data = map_coords, aes(color = Sampling_trip), show.legend = "point") +
  # Town names; repel keeps the labels from overlapping
  geom_text_repel(
    data = oz_cities,
    aes(x = Longitude, y = Latitude, label = Town),
    fontface = "bold",
    size = 3.2,
    col = "black",
    # One nudge per label, given in the row order of oz_cities
    nudge_x = c(
      -0.5, # Townsville
      -0.9, # Brisbane
      -0.5, # Cairns
      -0.8, # Cooktown
      -0.5, # Mackay
      -0.7  # Bundaberg
    ),
    nudge_y = c(
      -0.5, # Townsville
      0.5,  # Brisbane
      -0.5, # Cairns
      -0.5, # Cooktown
      -0.5, # Mackay
      -0.5  # Bundaberg
    )
  ) +
  coord_sf(xlim = c(142, 154), ylim = c(-10, -27)) +
  scale_color_manual(name = "Dates of Sampling Transects", values = colors) +
  labs(
    x = "Longitude",
    y = "Latitude",
    title = "IMOS-MGD",
    subtitle = "Microbial Genomics Database sites"
  ) +
  theme_classic() +
  theme(
    panel.background = element_rect(
      fill = "lightblue3",
      colour = "lightblue3",
      size = 0.5,
      linetype = "solid"
    ),
    legend.direction = "vertical",
    legend.box = "vertical"
  )
IMOS_MGD_dots_trip_black
Field sampling design for the GBR-MGD (Great Barrier Reef Microbial Genomics Database) dataset. (above) Seawater was collected from 48 offshore GBR reef sites for microbial community metagenomic sequencing and water chemistry analysis over four trips between November 2019 and July 2020. Reef sites are coloured in red or blue tones to denote trips that occurred during the austral summer (wet season) or austral winter (dry season), respectively. (below) A more detailed map showing the name of each reef site, and their membership in either offshore (41 reefs) or mid-shelf (7 reefs) waters. No inshore sites were sampled.

Field sampling design for the GBR-MGD (Great Barrier Reef Microbial Genomics Database) dataset. (above) Seawater was collected from 48 offshore GBR reef sites for microbial community metagenomic sequencing and water chemistry analysis over four trips between November 2019 and July 2020. Reef sites are coloured in red or blue tones to denote trips that occurred during the austral summer (wet season) or austral winter (dry season), respectively. (below) A more detailed map showing the name of each reef site, and their membership in either offshore (41 reefs) or mid-shelf (7 reefs) waters. No inshore sites were sampled.

To show reefs in more detail, we also plot a close-up of sites within each trip.

# One subset of map_coords per sampling trip, used by the close-up maps below
map_coords_trip1 <- map_coords %>% filter(Sampling_trip == "Trip_01_Nov-Dec_2019")
map_coords_trip2 <- map_coords %>% filter(Sampling_trip == "Trip_02_January_2020")
map_coords_trip3 <- map_coords %>% filter(Sampling_trip == "Trip_03_February_2020")
map_coords_trip4 <- map_coords %>% filter(Sampling_trip == "Trip_04_July_2020")
# Close-up map of the trip 1 sites, with every reef labelled by name.
#
# Optional layers that are currently switched off (kept for reference):
#   geom_sf(data = gbr_bounds, fill = "darkred", colour = NA)             # boundaries of the GBR MP
#   geom_sf(data = wbodies, mapping = aes(fill = MarineWate), lwd = 0.01) # water bodies in colour
#   scale_fill_manual(name = "Type of Water Body", values = cols_map)
#   theme(legend.direction = "vertical", legend.box = "vertical")
IMOS_MGD_trip1 <- ggplot(data = gbr_feat) + # gbr_feat comes from the AIMS GIS packages
  # Default rendering of gbr_feat first ...
  geom_sf() +
  # ... then the same features again with a flat fill and no outline
  geom_sf(data = gbr_feat, lwd = 0.01, fill = "seashell2", colour = NA) +
  # Our sites as dots, coloured per sampling trip (needs the 'geometry' column)
  geom_sf(data = map_coords_trip1, aes(color = Sampling_trip), show.legend = "point") +
  # Reef names in bold; repel keeps the labels from overlapping
  geom_text_repel(
    data = map_coords_trip1,
    aes(x = lon, y = lat, label = name, colour = Sampling_trip),
    fontface = "bold",
    size = 3.2,
    segment.color = "black",
    segment.alpha = 0.6,
    segment.size = 0.1,
    # One nudge per site of this trip, in the row order of map_coords_trip1.
    # A negative x moves the label to the left, a negative y moves it down.
    nudge_x = c(
      2.2,  # MCSWEENEY REEF
      2.4,  # MONSOON REEF
      1.2,  # 11-049
      1.2,  # 11-162
      0.9,  # MANTIS REEF
      1.6,  # LAGOON REEF
      0.4,  # DAVIE REEF
      -0.2, # CORBETT REEF
      0.4,  # 13-124
      -0.1, # SANDBANK 1 REEF
      0.5   # St Crispin
    ),
    nudge_y = c(
      0.2,  # MCSWEENEY REEF
      0.1,  # MONSOON REEF
      0.1,  # 11-049
      0.1,  # 11-162
      0.2,  # MANTIS REEF
      -0.3, # LAGOON REEF
      0.2,  # DAVIE REEF
      -0.8, # CORBETT REEF
      0.3,  # 13-124
      -1.6, # SANDBANK 1 REEF
      0.5   # St Crispin
    )
  ) +
  coord_sf(xlim = c(143, 147), ylim = c(-11, -16.5)) +
  scale_color_manual(name = "Sampling trip", values = c("indianred")) + # colour used for sampling trip 1
  labs(
    x = "Longitude",
    y = "Latitude",
    title = "IMOS Microbial Genomics Database sites",
    subtitle = "Trip 1 (Nov-Dec 2019)"
  ) +
  theme_classic() +
  theme(
    panel.background = element_rect(
      fill = "lightblue3",
      colour = "lightblue3",
      size = 0.5,
      linetype = "solid"
    )
  )
IMOS_MGD_trip1

# Close-up map of the trip 2 sites, with every reef labelled by name.
#
# Optional layers that are currently switched off (kept for reference):
#   geom_sf(data = gbr_bounds, fill = "darkred", colour = NA)             # boundaries of the GBR MP
#   geom_sf(data = wbodies, mapping = aes(fill = MarineWate), lwd = 0.01) # water bodies in colour
#   scale_fill_manual(name = "Type of Water Body", values = cols_map)
#   theme(legend.direction = "vertical", legend.box = "vertical")
IMOS_MGD_trip2 <- ggplot(data = gbr_feat) + # gbr_feat comes from the AIMS GIS packages
  # Default rendering of gbr_feat first ...
  geom_sf() +
  # ... then the same features again with a flat fill and no outline
  geom_sf(data = gbr_feat, lwd = 0.01, fill = "seashell2", colour = NA) +
  # Our sites as dots, coloured per sampling trip (needs the 'geometry' column)
  geom_sf(data = map_coords_trip2, aes(color = Sampling_trip), show.legend = "point") +
  # Reef names in bold; repel keeps the labels from overlapping
  geom_text_repel(
    data = map_coords_trip2,
    aes(x = lon, y = lat, label = name, colour = Sampling_trip),
    fontface = "bold",
    size = 3.2,
    segment.color = "black",
    segment.alpha = 0.6,
    segment.size = 0.1,
    # One nudge per site of this trip, in the row order of map_coords_trip2
    nudge_x = c(
      -0.1, # FAIRFAX REEF
      -0.4, # HOSKYN REEF
      0.3,  # BOULT REEF
      0.3,  # MASTHEAD REEF
      -0.2, # ERSKINE REEF
      0.4,  # BROOMFIELD REEF
      0.1,  # 21-550
      -0.5, # 22-084
      0.4,  # CHINAMAN REEF
      -0.1, # 21-580
      0.2,  # SMALL LAGOON REEF
      -0.3  # NORTH REEF
    ),
    nudge_y = c(
      -0.1, # FAIRFAX REEF
      -0.1, # HOSKYN REEF
      0.2,  # BOULT REEF
      -0.1, # MASTHEAD REEF
      0.2,  # ERSKINE REEF
      0.2,  # BROOMFIELD REEF
      0.2,  # 21-550
      -0.3, # 22-084
      0.2,  # CHINAMAN REEF
      -0.6, # 21-580
      0.4,  # SMALL LAGOON REEF
      0.1   # NORTH REEF
    )
  ) +
  coord_sf(xlim = c(151, 153), ylim = c(-21.5, -24)) +
  scale_color_manual(name = "Sampling trip", values = c("indianred4")) + # colour used for sampling trip 2
  labs(
    x = "Longitude",
    y = "Latitude",
    title = "IMOS Microbial Genomics Database sites",
    subtitle = "Trip 2 (January 2020)"
  ) +
  theme_classic() +
  theme(
    panel.background = element_rect(
      fill = "lightblue3",
      colour = "lightblue3",
      size = 0.5,
      linetype = "solid"
    )
  )
IMOS_MGD_trip2

# Close-up map of the trip 3 sites, with every reef labelled by name.
#
# Optional layers that are currently switched off (kept for reference):
#   geom_sf(data = gbr_bounds, fill = "darkred", colour = NA)             # boundaries of the GBR MP
#   geom_sf(data = wbodies, mapping = aes(fill = MarineWate), lwd = 0.01) # water bodies in colour
#   scale_fill_manual(name = "Type of Water Body", values = cols_map)
IMOS_MGD_trip3 <- ggplot(data = gbr_feat) + # gbr_feat comes from the AIMS GIS packages
  # Default rendering of gbr_feat first ...
  geom_sf() +
  # ... then the same features again with a flat fill and no outline
  geom_sf(data = gbr_feat, lwd = 0.01, fill = "seashell2", colour = NA) +
  # Our sites as dots, coloured per sampling trip (needs the 'geometry' column)
  geom_sf(data = map_coords_trip3, aes(color = Sampling_trip), show.legend = "point") +
  # Reef names in bold; repel keeps the labels from overlapping
  geom_text_repel(
    data = map_coords_trip3,
    aes(x = lon, y = lat, label = name, colour = Sampling_trip),
    fontface = "bold",
    size = 3.2,
    segment.color = "black",
    segment.alpha = 0.6,
    segment.size = 0.1,
    # One nudge per site of this trip, in the row order of map_coords_trip3.
    # The ST CRISPIN entries (x = 0.3, y = -0.1) are switched off here;
    # St Crispin is nudged in the trip 1 map instead.
    nudge_x = c(
      0.3, # AGINCOURT1 REEF
      0.3, # HASTINGS REEF
      0.3, # ARLINGTON REEF
      0.4, # THETFORD REEF
      0.4, # MOORE REEF
      0.3, # HEDLEY REEF
      0.3, # MCCULLOCH REEF
      0.4, # PEART REEF
      0.4, # FEATHER REEF
      0.1, # FARQUAHARSON REEF
      0.3  # TAYLOR REEF
    ),
    nudge_y = c(
      0.2,  # AGINCOURT1 REEF
      0.2,  # HASTINGS REEF
      0.1,  # ARLINGTON REEF
      0.2,  # THETFORD REEF
      0.1,  # MOORE REEF
      0.1,  # HEDLEY REEF
      0.2,  # MCCULLOCH REEF
      0.1,  # PEART REEF
      -0.1, # FEATHER REEF
      0.2,  # FARQUAHARSON REEF
      -0.1  # TAYLOR REEF
    )
  ) +
  coord_sf(xlim = c(145.4, 147), ylim = c(-15.8, -18)) +
  # "red3" is the colour given to Trip_03_February_2020 in the overview
  # palette (`colors`); the close-up previously used plain "red", so the same
  # trip appeared in two different reds across the figure panels.
  scale_color_manual(name = "Sampling trip", values = c("red3")) +
  labs(
    x = "Longitude",
    y = "Latitude",
    title = "IMOS Microbial Genomics Database sites",
    subtitle = "Trip 3 (February 2020)"
  ) +
  theme_classic() +
  theme(
    panel.background = element_rect(
      fill = "lightblue3",
      colour = "lightblue3",
      size = 0.5,
      linetype = "solid"
    ),
    legend.direction = "vertical",
    legend.box = "vertical"
  )
IMOS_MGD_trip3

# Close-up map of the trip 4 sites, with every reef labelled by name.
#
# Optional layers that are currently switched off (kept for reference):
#   geom_sf(data = gbr_bounds, fill = "darkred", colour = NA)             # boundaries of the GBR MP
#   geom_sf(data = wbodies, mapping = aes(fill = MarineWate), lwd = 0.01) # water bodies in colour
#   scale_fill_manual(name = "Type of Water Body", values = cols_map)
IMOS_MGD_trip4 <- ggplot(data = gbr_feat) + # gbr_feat comes from the AIMS GIS packages
  # Default rendering of gbr_feat first ...
  geom_sf() +
  # ... then the same features again with a flat fill and no outline
  geom_sf(data = gbr_feat, lwd = 0.01, fill = "seashell2", colour = NA) +
  # Our sites as dots, coloured per sampling trip (needs the 'geometry' column)
  geom_sf(data = map_coords_trip4, aes(color = Sampling_trip), show.legend = "point") +
  # Reef names in bold; repel keeps the labels from overlapping
  geom_text_repel(
    data = map_coords_trip4,
    aes(x = lon, y = lat, label = name, colour = Sampling_trip),
    fontface = "bold",
    size = 3.2,
    segment.color = "black",
    segment.alpha = 0.6,
    segment.size = 0.1,
    # One nudge per site of this trip, in the row order of map_coords_trip4
    nudge_x = c(
      -0.2, # LITTLE KELSO REEF
      -0.2, # KELSO REEF
      0.5,  # ROXBURGH REEF
      0.2,  # FORE&AFT REEF
      0.2,  # RIB REEF
      -0.1, # JOHN BREWER REEF
      0.1,  # MYRMIDON REEF
      -0.3, # CHICKEN REEF
      0.3,  # KNIFE REEF
      0.2,  # FORK REEF
      0.2,  # LYNCHS REEF
      -0.1, # CENTIPEDE REEF
      -0.1, # GRUB REEF
      -0.2  # HELIX REEF
    ),
    nudge_y = c(
      -0.2, # LITTLE KELSO REEF
      -0.1, # KELSO REEF
      0.1,  # ROXBURGH REEF
      0,    # FORE&AFT REEF
      0.1,  # RIB REEF
      -0.2, # JOHN BREWER REEF
      0.1,  # MYRMIDON REEF
      -0.3, # CHICKEN REEF
      0.1,  # KNIFE REEF
      0,    # FORK REEF
      -0.2, # LYNCHS REEF
      -0.1, # CENTIPEDE REEF
      -0.1, # GRUB REEF
      -0.2  # HELIX REEF
    )
  ) +
  coord_sf(xlim = c(146.8, 148), ylim = c(-18.1, -19)) +
  scale_color_manual(name = "Sampling trip", values = c("slateblue")) + # colour used for sampling trip 4
  labs(
    x = "Longitude",
    y = "Latitude",
    title = "IMOS Microbial Genomics Database sites",
    subtitle = "Trip 4 (July 2020)"
  ) +
  theme_classic() +
  theme(
    panel.background = element_rect(
      fill = "lightblue3",
      colour = "lightblue3",
      size = 0.5,
      linetype = "solid"
    ),
    legend.direction = "vertical",
    legend.box = "vertical"
  )
IMOS_MGD_trip4

# Put the four close-up maps next to each other in a single row
plot_grid(
  IMOS_MGD_trip1,
  IMOS_MGD_trip2,
  IMOS_MGD_trip3,
  IMOS_MGD_trip4,
  ncol = 4,
  nrow = 1
)

Physico-chemical data - collection, pre-processing, and analysis

# Table that the LTMP and water-quality (WQ) information is joined onto below
wrangling <- read.csv("/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Code/input_files/metadata_wrangling.csv")

# One of the WQ spreadsheets - the link between WQ IDs and our reef names.
# Only the columns of interest are kept. We decided not to include the
# remaining metrics: Swell_direction, Swell_height, Wind_direction, Wind_speed.
WQ_methods <- read.csv("/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Code/input_files/WQ_IDs_IMOS-MGD_only.csv") %>%
  dplyr::select(one_of(c(
    "REEF_NAME",
    "WQ_Station_Name",
    "Collection_method", # diving or from boat
    "Sample_collection_start"
  )))

# Additional data from the LTMP trips, reduced to the columns needed here.
# The trip labels are rewritten so that they carry the sampling dates and sort
# alphabetically in chronological order. The substitutions run one after
# another, top down.
LTMP <- read.csv("/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Code/input_files/metadata_IMOS_from_Mike.csv") %>%
  dplyr::select("REEF_NAME", "Sampling_trip", "GBR_sector", "SAMPLE_DATE", "Lat", "Long") %>%
  dplyr::mutate(
    Sampling_trip = as.character(Sampling_trip),
    Sampling_trip = gsub("First", "Trip_01_Nov-Dec_2019", Sampling_trip),   # First trip
    Sampling_trip = gsub("Second", "Trip_02_January_2020", Sampling_trip),  # Second trip
    Sampling_trip = gsub("Third", "Trip_03_February_2020", Sampling_trip),  # Third trip
    Sampling_trip = gsub("Fourth", "Trip_04_July_2020", Sampling_trip)      # Fourth trip
  )

# Build the sample metadata in two left joins. No `by` is given, so each join
# matches on every column name the two tables have in common.
metadata <- wrangling %>%
  left_join(LTMP) %>%   # step 1: the Sample IDs get the LTMP data
  left_join(WQ_methods) # step 2: plus the info from the WQ team

# The actual water chemistry measurements, reduced to the station name, depth
# and the chemistry variables. Temperature ("Temperature.C..") and "Salinity"
# are left out for now - too many missing values.
WQ_Result_Report <- read.csv("/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Code/input_files/MBM_Result_Report_R_Dec_2022.csv") %>%
  dplyr::select(one_of(c(
    "WQ_Station_Name",
    "DEPTH",
    "Chlorophyll_a_.µg.L.",
    "Phaeophytin_a_.µg.L.",
    "PN_.µM.",
    "POC_.µM.",
    "PP_.µM.",
    "DOC_.µM.",
    "PO4_.µM.",
    "NH4_.µM.",
    "NO2_.µM.",
    "NO3_.µM.",
    "Si_.µM.",
    "TDN_.µM.",
    "TDP_.µM.",
    "TSS_.mg.L."
  )))

# Add the reef name info to the measurements
# (joined on the column names the two tables share)
wq.all.reps.for.pca <- WQ_methods %>%
  left_join(WQ_Result_Report)

# Sampling trip per reef - needed to colour the groups on the PCA. The data
# from the research vessel are added to this table in the next step.
reefs_trips <- read.csv("/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Code/input_files/metadata_with_reef_names.csv")

# Keep only columns 1 and 3 (reef names and their corresponding trips), then
# rewrite the trip labels so that they carry the sampling dates and sort
# alphabetically in chronological order. The substitutions run one after
# another, top down.
# NOTE(review): the selection is positional - confirm that column 3 of the CSV
# is still `Sampling_trip` if the input file changes.
reefs_trips <- reefs_trips[, c(1, 3)] %>%
  dplyr::mutate(
    Sampling_trip = as.character(Sampling_trip),
    Sampling_trip = gsub("First", "Trip_01_Nov-Dec_2019", Sampling_trip),   # First trip
    Sampling_trip = gsub("Second", "Trip_02_January_2020", Sampling_trip),  # Second trip
    Sampling_trip = gsub("Third", "Trip_03_February_2020", Sampling_trip),  # Third trip
    Sampling_trip = gsub("Fourth", "Trip_04_July_2020", Sampling_trip)      # Fourth trip
  )

# Metadata recorded by the research vessel (RV).
# Kept: temperature, salinity and fluorescence, plus the reef name.
# "TURBIDITY_2.5m_RV" is currently left out.
vessel_metadata <- read.csv("/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Code/input_files/GBR-Genomics-Database_Seawater-Illumina-Reads.csv") %>%
  dplyr::select(one_of(c(
    "REEF_NAME",
    "SEAWATER_TEMPERATURE_2.5m_RV",
    "SALINITY_2.5m_RV",
    "FLUORESCENCE_2.5m_RV"
  )))

# Join the vessel data onto the reef/trip table, then onto the WQ replicates
# (both joins match on the column names the tables share)
reefs_trips <- reefs_trips %>%
  left_join(vessel_metadata)
wq.all.reps.for.pca <- wq.all.reps.for.pca %>%
  left_join(reefs_trips)

Principal Components Analysis (PCA) - What are the main clustering patterns between our reefs based on physico-chemical data?

PCA was applied on a physico-chemical dataset containing 17 variables, including: 1. 14 water chemistry variables: ammonium (NH4), nitrite (NO2), nitrate (NO3), total dissolved nitrogen (TDN), phosphate (PO4), total dissolved phosphorus (TDP), dissolved organic carbon (DOC), silicate (Si), total suspended solids (TSS), chlorophyll a (Chl-a), phaeophytin a (Phaeo), particulate organic carbon (POC), particulate nitrogen (PN), and particulate phosphorus (PP). For each of these 14 water chemistry variables, triplicate 5 L seawater samples were collected using Niskin bottles at each of the 48 reefs. 2. temperature, fluorescence, and salinity measurements from the underway sampling systems on the RV Solander and RV Cape Ferguson, with intake depths of 1.9 m (RV Cape Ferguson) and 2.5 m (RV Solander). For these three measurements, one value per reef site was recorded.

Choosing the number of components

The mixOmics function tune.pca() calculates the cumulative proportion of explained variance for a large number of principal components (here we set ncomp = 10). A screeplot of the proportion of explained variance relative to the total amount of variance in the data for each principal component is output.

# mixOmics::pca() chooses its solver from the data: SVD when there are no missing values, iterative NIPALS when there are. So the first step is to count the NAs in the columns that go into the PCA.
# NOTE(review): the boxplot code further down treats columns 6:19 as the chemistry variables and column 20 as Sampling_trip - confirm that c(6:14, 16:23) selects the intended 17 numeric columns.
pca_input <- wq.all.reps.for.pca[, c(6:14, 16:23)]
sum(is.na(pca_input))
## [1] 17
# Number of NAs
## [1] 17
# Missing values are present, so pca() will run the iterative NIPALS internally

# Explained variance for the first 10 components, on scaled data, shown as a screeplot
tune.pca.WQ <- tune.pca(pca_input, ncomp = 10, scale = TRUE)
plot(tune.pca.WQ)
Screeplot from the PCA performed on the IMOS GBR-MGD physico-chemical data: Amount of explained variance for each principal component is shown. From the numerical output (shown below in tabular format), we observe that the first two principal components explain 60.31% of the total variance. The rule of thumb for choosing the number of PCA components is not so much to set a hard threshold based on the cumulative proportion of explained variance (as this is data-dependent), but to observe when a drop, or elbow, appears on the screeplot. The elbow indicates that the remaining variance is spread over many principal components and is not relevant in obtaining a low-dimensional ‘snapshot’ of the data. Based on this, we chose to keep two PCA dimensions.

Screeplot from the PCA performed on the IMOS GBR-MGD physico-chemical data: Amount of explained variance for each principal component is shown. From the numerical output (shown below in tabular format), we observe that the first two principal components explain 60.31% of the total variance. The rule of thumb for choosing the number of PCA components is not so much to set a hard threshold based on the cumulative proportion of explained variance (as this is data-dependent), but to observe when a drop, or elbow, appears on the screeplot. The elbow indicates that the remaining variance is spread over many principal components and is not relevant in obtaining a low-dimensional ‘snapshot’ of the data. Based on this, we chose to keep two PCA dimensions.

# Numerical output
# Keep only the 17 measurement columns (9 + 8 by position) and fit a centred,
# scaled PCA with 10 components on them.
wq_numeric <- wq.all.reps.for.pca[, c(6:14, 16:23)]
pca.wq.all.reps <- pca(wq_numeric, ncomp = 10, center = TRUE, scale = TRUE)

# Share of the total variance carried by each individual component
knitr::kable(
  pca.wq.all.reps$prop_expl_var$X,
  caption = "The proportion of explained variance per each PCA component is:"
)
The proportion of explained variance per each PCA component is:
x
PC1 0.4066099
PC2 0.1964915
PC3 0.0910662
PC4 0.0643839
PC5 0.0470814
PC6 0.0422856
PC7 0.0305387
PC8 0.0288760
PC9 0.0231195
PC10 0.0200885
# Running total of the explained variance: the row for PCk is the share captured by PC1 to PCk together
knitr::kable(
  pca.wq.all.reps$cum.var,
  caption = "The cumulative proportion of variance explained by each PCA component"
)
The cumulative proportion of variance explained by each PCA component
x
PC1 0.4066099
PC2 0.6031014
PC3 0.6941676
PC4 0.7585515
PC5 0.8056329
PC6 0.8479185
PC7 0.8784572
PC8 0.9073332
PC9 0.9304527
PC10 0.9505412

Visualising patterns based on the final PCA model (with only the first two PCA dimensions)

# One colour per sampling trip, in the alphabetical order of the trip labels.
# Defined once so the sample plot and the biplot cannot drift apart.
pca_trip_colours <- c("indianred", # Sampling trip 1
                      "indianred4", # Sampling trip 2
                      "red3", # Sampling trip 3
                      "slateblue") # Sampling trip 4

# Sample plot on components 1 and 2: samples are grouped (and coloured) by
# sampling trip, with an ellipse drawn per group.
PCA_WQ_sample_plot <- plotIndiv(pca.wq.all.reps,
          comp = c(1, 2),
          group = wq.all.reps.for.pca$Sampling_trip,
#          ind.names = wq.all.reps.for.pca$REEF_NAME,
          ellipse = TRUE, # spelled out: `T` is an ordinary variable that can be reassigned
          col.per.group = pca_trip_colours,
          legend = TRUE,
          title = 'WQ Metadata all reps, PCA comp 1 - 2')

# Biplot on the same two components: samples plus the variable loadings.
PCA_WQ_biplot <- biplot(pca.wq.all.reps,
       comp = c(1, 2),
       group = wq.all.reps.for.pca$Sampling_trip,
#       ind.names = wq.all.reps.for.pca$REEF_NAME,
       col.per.group = pca_trip_colours,
       legend = TRUE,
       legend.title = "Sampling trip",
       title = 'PCA biplot for WQ Metadata all reps, PCA comp 1 - 2')

The PCA results suggest that our water chemistry measurements from across the GBR were largely driven by seasonality, while geography had a weaker influence. Chemistry profiles of samples collected in early austral summer were comparable despite being >1500 km apart in the far north (Cape Grenville and Princess Charlotte Bay sectors) and far south (Swains and Capricorn Bunker sectors) of the GBR, whereas samples collected during the peaks of austral summer and winter were the most distinct although they were geographically close in the central GBR (~200 km apart, Cairns and Cooktown / Lizard Island sectors for austral summer samples, and Innisfail and Townsville sectors for austral winter samples). Further, we observe that summer trips 1-3 were characterised by elevated temperature and higher concentrations of dissolved and particulate nutrients, apart from TDP and phosphate, which were elevated during winter.

However, we did not show reef names in either of the PCA plots as there is an overlap between data points (and hence the text is not readable), and also in these PCA visualisations, we lose context of raw values. This information was added with a heatmap (to compare physico-chemical metrics across sites) and with boxplots (which show the raw physico-chemical measurements).

Heatmap showing physico-chemical measurements across sites

We first collapsed the data to a single value per reef site: for each of the 17 environmental metrics we computed the median value per reef site, because the number of Niskin deployments differed between molecular (four replicates) and water chemistry (three replicates) sampling.

# Heatmap of the physico-chemical measurements (one tile per reef x variable).
# Columns 1-2 of `metadata` are the reef name and sampling trip; they are put
# back after melting, matched on the sample ID taken from the row names.
heatmap_sample_info <- metadata[, c(1, 2)] %>%
  rownames_to_column("Sample_ID")

WQ_heatmap <- metadata[, 24:40] %>% # Only the columns holding the median values
  # scale() subtracts each column's MEAN and divides by its SD, so every
  # variable ends up with mean 0 and SD 1
  scale(center = TRUE, scale = TRUE) %>%
  as.data.frame() %>% # scale() returns a matrix; ggplot needs a data frame
  rownames_to_column("Sample_ID") %>% # Keep the sample ID as a real column for melting/joining
  # Long format (Sample_ID, variable, value) - this is what geom_tile needs.
  # The id column is named explicitly instead of letting melt() guess it.
  reshape2::melt(id.vars = "Sample_ID") %>%
  left_join(heatmap_sample_info, by = "Sample_ID") %>% # REEF_NAME and Sampling_trip back in
  ggplot(aes(x = REEF_NAME, y = variable, fill = value)) +
  geom_tile() + # Plotting the heatmap here
  # Diverging palette around the default midpoint of 0 (the column mean after
  # scaling): red for high values, blue for low
  scale_fill_gradient2(low = "#075AFF",
                       mid = "#FFFFCC",
                       high = "#FF0000") +
  facet_wrap(~Sampling_trip, scales = "free_x", ncol = 4) + # One panel per sampling trip
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8))
WQ_heatmap
The heatmap shows the level of change in all 17 physico-chemical variables (y axis) across the reef sites (x axis), grouped within their corresponding sampling trip. Environmental measurements were centered (mean = 0) and scaled (standard deviation (SD) = 1) across reef sites, and values that deviate from the mean (0) are shown in red (> mean) and blue (< mean). This heatmap was combined in Inkscape with the PCA visualisation for physico-chemical data to re-introduce the context of reef sites, which were not visualised in the PCA.

The heatmap shows the level of change in all 17 physico-chemical variables (y axis) across the reef sites (x axis), grouped within their corresponding sampling trip. Environmental measurements were centered (mean = 0) and scaled (standard deviation (SD) = 1) across reef sites, and values that deviate from the mean (0) are shown in red (> mean) and blue (< mean). This heatmap was combined in Inkscape with the PCA visualisation for physico-chemical data to re-introduce the context of reef sites, which were not visualised in the PCA.

Box plots showing raw physico-chemical measurements, as well as mean and median values

# Boxplots of the raw measurements: one row of panels per variable, one column
# of panels per sampling trip. The box's centre line is the median (ggplot2's
# default); the mean is added on top as a separate green point.
boxplot_trip_colours <- c("indianred", # Sampling trip 1
                          "indianred4", # Sampling trip 2
                          "red3", # Sampling trip 3
                          "slateblue") # Sampling trip 4

# Long format; melt() is left to pick the non-numeric columns as id variables
wq_long_for_boxplots <- reshape2::melt(wq.all.reps.for.pca[, c(1, # Reef name
                                                               6:19, # All numerical vals
                                                               21, # Temperature
                                                               22, # Salinity
                                                               23, # Fluorescence
                                                               20)]) # Sampling trip

# The original call passed `alpha = 0.8` to ggplot() itself. ggplot() does not
# use extra arguments, so it never had any effect; it is dropped here, which
# leaves the figure unchanged. Put `alpha` inside geom_boxplot() if transparent
# boxes are actually wanted.
ggplot(wq_long_for_boxplots,
       aes(y = value,
           x = Sampling_trip,
           fill = Sampling_trip)) +
  geom_boxplot(#outlier.colour="red",
               outlier.shape = 8,
               outlier.size = 4) +
  geom_jitter(alpha = 0.6,
              size = 0.8) + # Raw measurements on top of the boxes
  stat_summary(fun = mean,
               geom = "point",
               shape = 20,
               size = 0.8,
               color = "seagreen1",
               fill = "seagreen1") + # Plotting the mean as a green dot!
  facet_grid(rows = vars(variable),
             cols = vars(Sampling_trip),
             scales = "free") +
  scale_fill_manual(values = boxplot_trip_colours) +
  # NOTE(review): the x aesthetic is Sampling_trip, yet the axis is labelled
  # "Reef sites" - confirm which label is intended.
  labs(y = "WQ metrics",
       x = "Reef sites",
       title = "Boxplots for WQ metrics (Median & Mean)") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 8))
FIG CAP TO BE ADDED.

FIG CAP TO BE ADDED.

Getting the numerical summary of physico-chemical variables

# Median of every physico-chemical metric within each sampling trip, laid out
# as one row per metric and one column per trip.
wq_median_per_trip <- wq.all.reps.for.pca[, c(1, # Reef name
                                              6:19, # All numerical vals
                                              21, # Temperature
                                              22, # Salinity
                                              23, # Fluorescence
                                              20)] %>% # Sampling_trip
  reshape2::melt() %>% # Data needs to be in long format (variable / value columns)
  group_by(Sampling_trip, variable) %>%
  # Median only (NAs ignored), rounded to 2 decimals. `.groups = "drop"` hands
  # an ungrouped table to dcast() and silences the regrouping message.
  dplyr::summarize(median = round(median(value, na.rm = TRUE),
                                  digits = 2),
                   .groups = "drop") %>%
  # Back to wide format; the value column is named instead of being guessed
  reshape2::dcast(variable ~ Sampling_trip, value.var = "median")
# Showing as table
knitr::kable(wq_median_per_trip, caption = "Median for all 17 physico-chemical metrics, collapsed across the four sampling trips.")
Median for all 17 physico-chemical metrics, collapsed across the four sampling trips.
variable Trip_01_Nov-Dec_2019 Trip_02_January_2020 Trip_03_February_2020 Trip_04_July_2020
Chlorophyll_a_.µg.L. 0.17 0.15 0.23 0.10
Phaeophytin_a_.µg.L. 0.17 0.17 0.36 0.10
PN_.µM. 1.23 1.19 1.30 0.50
POC_.µM. 7.12 7.77 9.74 3.52
PP_.µM. 0.04 0.04 0.06 0.02
DOC_.µM. 83.75 79.58 65.83 69.53
PO4_.µM. 0.05 0.04 0.02 0.09
NH4_.µM. 0.32 0.52 0.68 0.11
NO2_.µM. 0.02 0.04 0.03 0.01
NO3_.µM. 0.21 0.34 0.21 0.20
Si_.µM. 1.38 1.14 2.00 1.86
TDN_.µM. 5.43 6.53 5.70 5.28
TDP_.µM. 0.20 0.23 0.16 0.26
TSS_.mg.L. 0.43 0.13 0.08 0.05
SEAWATER_TEMPERATURE_2.5m_RV 27.78 27.16 30.16 24.40
SALINITY_2.5m_RV 35.27 35.55 34.72 35.16
FLUORESCENCE_2.5m_RV 0.10 0.11 0.32 0.09
# Mean of every physico-chemical metric within each sampling trip, laid out as
# one row per metric and one column per trip.
wq_mean_per_trip <- wq.all.reps.for.pca[, c(1, # Reef name
                                            6:19, # All numerical vals
                                            21, # Temperature
                                            22, # Salinity
                                            23, # Fluorescence
                                            20)] %>% # Sampling_trip
  reshape2::melt() %>% # Data needs to be in long format (variable / value columns)
  group_by(Sampling_trip, variable) %>%
  # Mean only (NAs ignored), rounded to 2 decimals. `.groups = "drop"` hands an
  # ungrouped table to dcast() and silences the regrouping message.
  dplyr::summarize(mean = round(mean(value, na.rm = TRUE),
                                digits = 2),
                   .groups = "drop") %>%
  # Back to wide format; the value column is named instead of being guessed
  reshape2::dcast(variable ~ Sampling_trip, value.var = "mean")
# Showing as table
knitr::kable(wq_mean_per_trip, caption = "Mean for all 17 physico-chemical metrics, collapsed across the four sampling trips.")
Mean for all 17 physico-chemical metrics, collapsed across the four sampling trips.
variable Trip_01_Nov-Dec_2019 Trip_02_January_2020 Trip_03_February_2020 Trip_04_July_2020
Chlorophyll_a_.µg.L. 0.18 0.16 0.32 0.11
Phaeophytin_a_.µg.L. 0.18 0.20 0.36 0.10
PN_.µM. 1.23 1.27 1.32 0.50
POC_.µM. 8.06 7.60 9.95 3.66
PP_.µM. 0.05 0.05 0.07 0.02
DOC_.µM. 84.51 81.92 67.22 69.30
PO4_.µM. 0.05 0.04 0.02 0.10
NH4_.µM. 0.39 0.58 0.74 0.12
NO2_.µM. 0.03 0.04 0.04 0.01
NO3_.µM. 0.30 0.33 0.35 0.23
Si_.µM. 1.41 1.30 2.10 1.79
TDN_.µM. 5.47 6.62 5.64 5.18
TDP_.µM. 0.20 0.23 0.16 0.26
TSS_.mg.L. 0.48 0.15 0.35 0.12
SEAWATER_TEMPERATURE_2.5m_RV 27.78 27.13 30.01 24.22
SALINITY_2.5m_RV 35.35 35.52 34.71 35.16
FLUORESCENCE_2.5m_RV 0.10 0.10 0.34 0.13
# Standard deviation of every physico-chemical metric within each sampling
# trip, laid out as one row per metric and one column per trip.
wq_sd_per_trip <- wq.all.reps.for.pca[, c(1, # Reef name
                                          6:19, # All numerical vals
                                          21, # Temperature
                                          22, # Salinity
                                          23, # Fluorescence
                                          20)] %>% # Sampling_trip
  reshape2::melt() %>% # Data needs to be in long format (variable / value columns)
  group_by(Sampling_trip, variable) %>%
  # SD only (NAs ignored), rounded to 2 decimals. `.groups = "drop"` hands an
  # ungrouped table to dcast() and silences the regrouping message.
  dplyr::summarize(sd = round(sd(value, na.rm = TRUE),
                              digits = 2),
                   .groups = "drop") %>%
  # Back to wide format; the value column is named instead of being guessed
  reshape2::dcast(variable ~ Sampling_trip, value.var = "sd")
# Showing as table
knitr::kable(wq_sd_per_trip, caption = "SD for all 17 physico-chemical metrics, collapsed across the four sampling trips.")
SD for all 17 physico-chemical metrics, collapsed across the four sampling trips.
variable Trip_01_Nov-Dec_2019 Trip_02_January_2020 Trip_03_February_2020 Trip_04_July_2020
Chlorophyll_a_.µg.L. 0.06 0.08 0.18 0.03
Phaeophytin_a_.µg.L. 0.04 0.08 0.15 0.02
PN_.µM. 0.35 0.46 0.22 0.10
POC_.µM. 2.86 1.89 2.29 1.00
PP_.µM. 0.02 0.02 0.03 0.01
DOC_.µM. 5.99 9.89 4.60 4.67
PO4_.µM. 0.03 0.02 0.02 0.02
NH4_.µM. 0.15 0.27 0.44 0.06
NO2_.µM. 0.02 0.01 0.02 0.00
NO3_.µM. 0.25 0.15 0.31 0.16
Si_.µM. 0.30 0.44 0.55 0.65
TDN_.µM. 0.83 0.82 0.72 0.75
TDP_.µM. 0.03 0.04 0.03 0.02
TSS_.mg.L. 0.41 0.15 0.52 0.10
SEAWATER_TEMPERATURE_2.5m_RV 0.43 0.61 0.39 0.95
SALINITY_2.5m_RV 0.21 0.17 0.05 0.04
FLUORESCENCE_2.5m_RV 0.01 0.02 0.05 0.12
# Writing the three per-trip summary tables to the supplementary-tables folder.
# Strings are written unquoted and without a row-name column.
supp_tables_dir <- "/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Supplementary_Tables"

# Exporting Medians as csv
write.csv(wq_median_per_trip,
          file = file.path(supp_tables_dir, "Table_WQ_Median_per_trip.csv"),
          quote = FALSE, row.names = FALSE)

# Exporting Means as csv
write.csv(wq_mean_per_trip,
          file = file.path(supp_tables_dir, "Table_WQ_mean_per_trip.csv"),
          quote = FALSE, row.names = FALSE)

# Exporting SD as csv
write.csv(wq_sd_per_trip,
          file = file.path(supp_tables_dir, "Table_WQ_SD_per_trip.csv"),
          quote = FALSE, row.names = FALSE)

# The csv files on median and sd values were merged manually to make the Table 1 in the main text of the manuscript

Microbial metagenomic data - collection, pre-processing, and analysis

Raw counts were exported from MEGAN as biom files separately for (1) microbial taxonomy (genus level as the lowest category) and for (2) microbial functions (GO terms), and subsequently imported into R using the phyloseq R package. These biom files were combined with the metadata file to create 2 phyloseq objects (for taxa and genes), which have then undergone various filtering steps.

### Importing the biom tables, exported from MEGAN
biom_input_dir <- "/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Code/input_files"

### Taxonomy info | at 'Genus' level
megan_genus <- import_biom(file.path(biom_input_dir, "IMOS-MGD_Seawater_full_dataset_Genera_191_samples_Neg_controls_July_2024.biom"))
### Functional info | GO terms
megan_GO_5 <- import_biom(file.path(biom_input_dir, "IMOS-MGD_Seawater_GOs_Rank5_191_samples_Neg_controls_July_2024.biom"))
# This one has 7476 GO terms
megan_GO_4 <- import_biom(file.path(biom_input_dir, "IMOS-MGD_Seawater_GOs_Rank4_191_samples_Neg_controls_July_2024.biom"))
# This one has 5257 GO terms
megan_GO_3 <- import_biom(file.path(biom_input_dir, "IMOS-MGD_Seawater_GOs_Rank3_191_samples_Neg_controls_July_2024.biom"))
# This one has 706 GO terms

# Let's just modify the metadata file a bit to include neg controls as well:
# start from every sample ID present in the genus table and left-join the
# metadata onto it, so a sample without a metadata row is kept (with NA in
# every metadata column) rather than dropped.
metadata_neg_controls <- data.frame(Sample_ID = sample_names(megan_genus),
                                    stringsAsFactors = FALSE) %>%
  left_join(metadata %>%
              rownames_to_column("Sample_ID"),
            by = "Sample_ID") %>%
  column_to_rownames("Sample_ID")

# Merging: the same sample table is attached to all four objects.
# NOTE(review): this assumes the three GO tables carry the same sample IDs as
# the genus table - the printed objects all report the same number of samples.
sample_data(megan_genus) <- sample_data(metadata_neg_controls)
sample_data(megan_GO_5) <- sample_data(metadata_neg_controls)
sample_data(megan_GO_4) <- sample_data(metadata_neg_controls)
sample_data(megan_GO_3) <- sample_data(metadata_neg_controls)

# Checking the phyloseq objects
# Each printout lists the number of features and samples, the number of sample
# variables, and the number of ranks. All four objects should agree on the
# sample count and on the number of sample variables; only the feature counts
# and the number of ranks differ between them.
megan_genus
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 2066 taxa and 207 samples ]
## sample_data() Sample Data:       [ 207 samples by 40 sample variables ]
## tax_table()   Taxonomy Table:    [ 2066 taxa by 7 taxonomic ranks ]
megan_GO_5
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 7476 taxa and 207 samples ]
## sample_data() Sample Data:       [ 207 samples by 40 sample variables ]
## tax_table()   Taxonomy Table:    [ 7476 taxa by 6 taxonomic ranks ]
megan_GO_4
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 5257 taxa and 207 samples ]
## sample_data() Sample Data:       [ 207 samples by 40 sample variables ]
## tax_table()   Taxonomy Table:    [ 5257 taxa by 4 taxonomic ranks ]
megan_GO_3
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 706 taxa and 207 samples ]
## sample_data() Sample Data:       [ 207 samples by 40 sample variables ]
## tax_table()   Taxonomy Table:    [ 706 taxa by 3 taxonomic ranks ]
# But I want to filter out the PF samples, and Broomfield rep 2 (because the sequencing was repeated for this one)
# The 13 sample IDs to drop are listed once and applied to all four objects, so
# the taxonomy and GO tables cannot end up with different exclusion lists.
pf_and_failed_samples <- c("Lynchs-PF-1_S107_R1",
                           "Lynchs-PF-2_S108_R1",
                           "Lynchs-PF-3_S109_R1",
                           "Lynchs-PF-4_S110_R1",
                           "Myrmidon-PF-1_S111_R1",
                           "Myrmidon-PF-2_S112_R1",
                           "Myrmidon-PF-3_S113_R1",
                           "Myrmidon-PF-4_S114_R1",
                           "Rib-PF-1_S103_R1",
                           "Rib-PF-2_S104_R1",
                           "Rib-PF-3_S105_R1",
                           "Rib-PF-4_S106_R1",
                           "Broomfield-2_S50_R1")

# Returns `physeq` without the samples listed in `excluded`. prune_samples()
# keeps exactly the sample names it is given; setdiff() preserves the original
# sample order, and names in `excluded` that are absent from the object are
# simply ignored.
drop_excluded_samples <- function(physeq, excluded = pf_and_failed_samples) {
  prune_samples(setdiff(sample_names(physeq), excluded), physeq)
}

megan_genus <- drop_excluded_samples(megan_genus)
megan_GO_5 <- drop_excluded_samples(megan_GO_5)
megan_GO_4 <- drop_excluded_samples(megan_GO_4)
megan_GO_3 <- drop_excluded_samples(megan_GO_3)

# Checking the object again
# After the exclusion step each object reports 13 fewer samples (207 -> 194);
# the feature counts are unchanged because only samples were removed.
megan_genus
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 2066 taxa and 194 samples ]
## sample_data() Sample Data:       [ 194 samples by 40 sample variables ]
## tax_table()   Taxonomy Table:    [ 2066 taxa by 7 taxonomic ranks ]
megan_GO_5
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 7476 taxa and 194 samples ]
## sample_data() Sample Data:       [ 194 samples by 40 sample variables ]
## tax_table()   Taxonomy Table:    [ 7476 taxa by 6 taxonomic ranks ]
megan_GO_4
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 5257 taxa and 194 samples ]
## sample_data() Sample Data:       [ 194 samples by 40 sample variables ]
## tax_table()   Taxonomy Table:    [ 5257 taxa by 4 taxonomic ranks ]
megan_GO_3
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 706 taxa and 194 samples ]
## sample_data() Sample Data:       [ 194 samples by 40 sample variables ]
## tax_table()   Taxonomy Table:    [ 706 taxa by 3 taxonomic ranks ]
# These samples still include the 3 negative controls

Processing data

After removing the non pre-filtered samples, further data filtering included removal of reads (1) annotated as eukaryotic or viral; and (2) rare/spurious reads. Data was then Center-Log-Ratio (hereinafter ‘CLR’) transformed for statistical analysis in the mixOmics R package.

Removal of Eukaryotic contamination

We annotated a total of 1919 microbial taxa (lowest category: genus level). Reads that were annotated as Eukarya (729 taxa in total) and viruses (11 viral annotations) were excluded from the analysis. Further analysis was performed on a phyloseq object with prokaryotic annotations only, a total of 1179 bacterial and archaeal groups (Figure 2, Table 1).

# Before plotting the bar plots, I first need to prepare my objects
### Taxonomy info | at 'Genus' level
megan_genus_all <- megan_genus
megan_genus_TAX_all <- as.data.frame(megan_genus_all@tax_table)

# Plot admixture barplot - Domain level (and viruses)
# Fill colours, keyed by the Rank1 labels they apply to
cols_domain <- c(
  "d__Archaea" = "slategray3",          # Archaea
  "d__Bacteria" = "grey45",             # Bacteria
  "d__Eukaryota" = "salmon",            # Eukaryota
  "f__Mimiviridae" = "violetred",    # Family Mimiviridae
  "f__Phycodnaviridae" = "steelblue3",  # Family Phycodnaviridae
  "f__Retroviridae" = "lightsteelblue4", # Family Retroviridae
  "o__Caudovirales" = "seashell4"    # Order Caudovirales
)

# Raw counts summed per Rank1 label: one row per label, one column per sample.
# across(where(is.numeric), sum) replaces summarise_if(..., funs(sum)); funs()
# is deprecated in dplyr. Only the count columns are numeric, so the character
# rank columns drop out of the summary exactly as before.
DOMAIN <- as.data.frame(megan_genus_all@otu_table) %>%
  rownames_to_column("OTUs") %>% # I will need this later to add taxonomy info
  left_join(megan_genus_TAX_all %>% rownames_to_column("OTUs"),
            by = "OTUs") %>% # adding taxonomy info
  column_to_rownames("OTUs") %>%
  group_by(Rank1) %>%
  summarise(across(where(is.numeric), sum)) # Computing sums

# Now relative abundances: every sample column is divided by its own total, so
# each sample sums to 1 across the Rank1 labels.
DOMAIN_RA <- DOMAIN %>%
  mutate(across(where(is.numeric), function(counts) counts / sum(counts)))

barplots_domain <- DOMAIN_RA %>%
  column_to_rownames("Rank1") %>%
  t() %>% # Samples as rows
  as.data.frame() %>%
  rownames_to_column("Sample_ID") %>%
  reshape2::melt(id.vars = "Sample_ID") %>% # Long format: Sample_ID / variable (Rank1 label) / value
  left_join(metadata %>% rownames_to_column("Sample_ID"),
            by = "Sample_ID") %>%
# Plotting now!
  ggplot(aes(x = Sample_ID, y = value, fill = variable)) +
  geom_bar(stat = "identity") + # Stacked bars of the pre-computed proportions
  scale_y_continuous(expand = c(0, 0)) +
  facet_wrap(~Sampling_trip, scales = "free", nrow = 5) +
#  facet_grid(~Sampling_trip, scales = "free_x", space = "free")+
  scale_fill_manual(values = cols_domain) +
  ylab("Relative abundance of taxa (at Domain level)") +
  xlab("Reef sites") +
  theme(axis.text.x = # element_blank(),
        element_text(angle = 75, hjust = 1, size = 12),
        #axis.ticks.x = element_blank(),
        #axis.title.x = element_blank(),
        strip.text = element_text(colour = "black", size = 12),
        panel.grid = element_blank(),
        panel.background = element_blank(),
        legend.position = "right",
        legend.title = element_blank(),
        legend.text = element_text(size = 12))
barplots_domain

We see that we have eukaryotic reads, let’s see how many taxa?

# Split the genus-level object by the Rank1 label (raw counts throughout)
megan_genus_EUKS <- subset_taxa(megan_genus_all, Rank1 == "d__Eukaryota") # Euks only
megan_genus_bacteria <- subset_taxa(megan_genus, Rank1 == "d__Bacteria") # Bacteria only
megan_genus_archaea <- subset_taxa(megan_genus, Rank1 == "d__Archaea") # Archaea only

# Prokaryotes = Bacteria + Archaea, merged back into a single phyloseq object
megan_genus_PROKS <- merge_phyloseq(megan_genus_bacteria, megan_genus_archaea)

# One-row table with the number of taxa in each group; the counts are kept as
# text, and the column headers shown are the ones given in `col.names`
domain_taxa_counts <- data.frame(
  V1 = as.character(ntaxa(megan_genus_EUKS)),
  V2 = as.character(ntaxa(megan_genus_bacteria)),
  V3 = as.character(ntaxa(megan_genus_archaea)),
  V4 = as.character(ntaxa(megan_genus_PROKS))
)
knitr::kable(domain_taxa_counts,
             caption = "Taxonomic breakdown",
             col.names = c("Eukaryota", "Bacteria", "Archaea", "Prokarya"))
Taxonomic breakdown
Eukaryota Bacteria Archaea Prokarya
774 1212 45 1257

If we compare with abundances of prokaryotes (which are the target for this study), are there any euks that are highly abundant?

# Genus-level biom export that still contains the eukaryotic reads
megan_genus_all_with_euks <- import_biom(
  "/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Code/input_files/IMOS-MGD_Seawater_full_dataset_Genera_191_samples_Neg_controls_July_2024.biom"
)
# Attach the sample metadata (the table that includes the negative controls)
sample_data(megan_genus_all_with_euks) <- sample_data(metadata_neg_controls)
# Drop the PF samples and Broomfield 2 (sequencing failed for this replicate,
# and a repeated sample exists for rep 2)
megan_genus_all_with_euks <- subset_samples(
  megan_genus_all_with_euks,
  !(sample_names(megan_genus_all_with_euks) %in% c(
    "Lynchs-PF-1_S107_R1", "Lynchs-PF-2_S108_R1",
    "Lynchs-PF-3_S109_R1", "Lynchs-PF-4_S110_R1",
    "Myrmidon-PF-1_S111_R1", "Myrmidon-PF-2_S112_R1",
    "Myrmidon-PF-3_S113_R1", "Myrmidon-PF-4_S114_R1",
    "Rib-PF-1_S103_R1", "Rib-PF-2_S104_R1",
    "Rib-PF-3_S105_R1", "Rib-PF-4_S106_R1",
    "Broomfield-2_S50_R1"
  ))
)

# Drop taxa that carry no Rank2 annotation.
# NOTE(review): this object is not used by the relative-abundance step below,
# which starts again from megan_genus_all_with_euks - confirm that is intended.
megan_genus_all_anno_only <- subset_taxa(megan_genus_all_with_euks, Rank2 != "NA")
# Convert counts to proportions: every value is divided by the total of its sample
megan_genus_all_RA <- transform_sample_counts(
  megan_genus_all_with_euks,
  function(counts) counts / sum(counts)
)

# Names of the 200 taxa with the largest summed relative abundance.
# Change the number passed to head() to look at more or fewer genera.
megan_genus_top200_RA_abund_with_euks <- names(
  head(sort(taxa_sums(megan_genus_all_RA), decreasing = TRUE), 200)
)

# New phyloseq object restricted to those top 200 taxa
megan_genus_top200_RA_with_euks <- prune_taxa(
  megan_genus_top200_RA_abund_with_euks,
  megan_genus_all_RA
)

# Colour-scale breaks for the heatmap (from Steve). The fine steps at the low
# end make sure even very lowly abundant taxa stay visible.
breaks <- c(0, 0.001, 0.01, 0.05, 0.1, 0.25, 0.4, 0.5, 0.6, 0.7, 1)

# Coarser alternative, if fewer breaks are wanted:
# breaks_5=c(0,0.001,0.1,0.3,0.7,1)

# Heatmap of the top 200 taxa: one tile per taxon and sample, one facet per trip
left_join(
  otu_table(megan_genus_top200_RA_with_euks) %>%
    as.data.frame() %>%
    rownames_to_column("OTU"),
  tax_table(megan_genus_top200_RA_with_euks) %>%
    as.data.frame() %>%
    rownames_to_column("OTU")
) %>%
  # Rows follow the abundance ranking computed for the top-200 selection
  arrange(match(OTU, megan_genus_top200_RA_abund_with_euks)) %>%
  # Full taxonomy string used as the y-axis label
  unite(taxonomy, c(OTU, Rank1, Rank2, Rank3, Rank4, Rank5, Rank6, Rank7), sep = "; ") %>%
  # Long format; 'Reads' holds relative abundances here (the input is the RA object)
  gather(Sample_ID, Reads, -taxonomy) %>%
  # Alternative metadata source:
  #  left_join(as.data.frame(sample_data(megan_genus_all_RA)) %>% rownames_to_column("Sample_ID")) %>%
  left_join(metadata %>% rownames_to_column("Sample_ID")) %>%
  ggplot(aes(
    x = Sample_ID,
    y = reorder(taxonomy, Reads), # taxa ordered by their abundance values
    fill = Reads
  )) +
  geom_tile() +
  facet_grid(cols = vars(Sampling_trip), scales = "free_x", space = "free") +
  scale_x_discrete(expand = c(0, 0)) +
  scale_y_discrete(expand = c(0, 0)) +
  # Alternative with the smaller number of breaks:
  #  scale_fill_stepsn(breaks = breaks_5, colours =c("white", "slategray1", "slategray2", "slategray3", "slategray4")) +
  # NOTE(review): 'breaks' defines 10 bins but only 9 colours are listed
  # ("lightgoldenrod1" is commented out), so the per-bin ranges noted next to
  # the colours may not line up one-to-one - confirm against the rendered legend.
  scale_fill_stepsn(
    breaks = breaks,
    colours = c(
      "white",       # 0 - 0.001
      "slategray4",  # 0.001 - 0.01
      "slategray3",  # 0.01 - 0.05
      "slategray2",  # 0.05 - 0.1
      "navajowhite", # 0.1 - 0.25
      "rosybrown2",  # 0.25 - 0.4
      "lightsalmon", # 0.4 - 0.5
      "rosybrown1",  # 0.5 - 0.6
      # "lightgoldenrod1", # 0.6 - 0.7
      "indianred2"   # 0.7 - 1
    )
  ) +
  # Sample labels rotated by 90 degrees
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 8))

Removal of Rare/Spurious Reads - prokaryotes only

Prior to removal of rare and spurious reads, non-annotated reads were removed from the dataset. We then computed relative abundance values (RA) and removed reads with average RA < 0.0001% across samples. After removing OTUs that were less than 0.0001% abundant, we retained 618 taxa (primarily at Genus level) out of the initial 1179 prokaryotic OTUs. At functional level, we retained 5015 GO annotations (out of 8689 GO terms).

### IMPORTANT - ***Change this part of the script*** depending on which phyloseq object I would like to look at: prokaryotic, eukaryotic or all. This way I wouldn't need to modify multiple lines in the script below
megan_genus <- megan_genus_PROKS # options to choose from: megan_genus_PROKS, megan_genus_EUKS

# Cleaning the names here already, so that every phyloseq object derived from
# megan_genus carries an organised taxonomy.
# Step 1: collapse Rank1-Rank7 into a single "; "-separated string per taxon.
megan_genus_TAX_PROKS <- as.data.frame(megan_genus@tax_table) %>%
  unite(Taxonomy, c(Rank1, Rank2, Rank3, Rank4, Rank5, Rank6, Rank7), sep = "; ")

# Step 2: pull each rank out of the string by its prefix (d__, p__, c__, ...).
# A rank ends at the next ";" OR at the end of the string. The last field of
# the united string has no trailing ";", so a pattern that requires one can
# never match whichever rank sits in the final position.
megan_genus_TAX_PROKS$Domain <- str_match(megan_genus_TAX_PROKS$Taxonomy, "^d__(.+?)(?:;|$)")[, 2]
megan_genus_TAX_PROKS$Phylum <- str_match(megan_genus_TAX_PROKS$Taxonomy, "; p__(.+?)(?:;|$)")[, 2]
megan_genus_TAX_PROKS$Class <- str_match(megan_genus_TAX_PROKS$Taxonomy, "; c__(.+?)(?:;|$)")[, 2]
megan_genus_TAX_PROKS$Order <- str_match(megan_genus_TAX_PROKS$Taxonomy, "; o__(.+?)(?:;|$)")[, 2]
megan_genus_TAX_PROKS$Family <- str_match(megan_genus_TAX_PROKS$Taxonomy, "; f__(.+?)(?:;|$)")[, 2]
megan_genus_TAX_PROKS$Genus <- str_match(megan_genus_TAX_PROKS$Taxonomy, "; g__(.+?)(?:;|$)")[, 2]
megan_genus_TAX_PROKS$Species <- str_match(megan_genus_TAX_PROKS$Taxonomy, "; s__(.+?)(?:;|$)")[, 2]

# Step 3: keep only the seven parsed rank columns (drops the united string)
megan_genus_TAX_PROKS <- megan_genus_TAX_PROKS %>%
  dplyr::select(Domain, Phylum, Class, Order, Family, Genus, Species)

# Step 4: ranks that could not be parsed are stored as the literal string "NA"
# (not a missing value), which is what later filters such as Phylum != "NA" test
megan_genus_TAX_PROKS[is.na(megan_genus_TAX_PROKS)] <- "NA"
# All cleaned up! :) thanks ChatGPT

### Sanity checks before the polished taxonomy goes back into the phyloseq object
# Taxa names currently stored in the phyloseq object
current_taxa_names <- taxa_names(megan_genus)
# Taxa names carried by the polished table (its row names)
polished_tax_table <- rownames(as.matrix(megan_genus_TAX_PROKS))
# Abort if the two sets of names differ in content or order
if (!identical(current_taxa_names, polished_tax_table)) {
  stop("Polished taxonomic names in megan_genus_TAX_PROKS do not match the taxa_names in the megan_genus phyloseq object.")
}

# The polished table must also have exactly one row per taxon
nrow_tax_table <- nrow(megan_genus_TAX_PROKS)
ntaxa_physeq <- ntaxa(megan_genus) # number of taxa in the phyloseq object

if (nrow_tax_table != ntaxa_physeq) {
  stop("Number of rows in megan_genus_TAX_PROKS does not match the number of taxa in megan_genus phyloseq object.")
}

# Step 3: the OTU identifiers must appear in the same order in both tables
identical(
  # OTU order in the polished taxonomy
  megan_genus_TAX_PROKS %>%
    rownames_to_column("OTUs") %>%
    dplyr::select("OTUs"),
  # OTU order in the taxonomy currently held by the phyloseq object
  as.data.frame(megan_genus@tax_table) %>%
    rownames_to_column("OTUs") %>%
    dplyr::select("OTUs")
)
## [1] TRUE
# Same order in both

# Row names of the polished table versus row names of the OTU table
identical(
  row.names(megan_genus_TAX_PROKS),
  row.names(otu_table(megan_genus))
)
## [1] TRUE
# Yes, the row names are identical

# Store the polished taxonomy in the phyloseq object
tax_table(megan_genus) <- as.matrix(megan_genus_TAX_PROKS)

### Dropping the three negative-control samples from each phyloseq object
# Taxa
megan_genus_no_neg_control <- subset_samples(
  megan_genus,
  !(sample_names(megan_genus) %in%
    c("Neg-control-1_S101_R1", "Neg-control-2_S24_R1", "Neg-control-3_S116_R1"))
)

# GO terms at rank 5
megan_GO_5_no_neg_control <- subset_samples(
  megan_GO_5,
  !(sample_names(megan_GO_5) %in%
    c("Neg-control-1_S101_R1", "Neg-control-2_S24_R1", "Neg-control-3_S116_R1"))
)

# GO terms at rank 4
megan_GO_4_no_neg_control <- subset_samples(
  megan_GO_4,
  !(sample_names(megan_GO_4) %in%
    c("Neg-control-1_S101_R1", "Neg-control-2_S24_R1", "Neg-control-3_S116_R1"))
)

# GO terms at rank 3
megan_GO_3_no_neg_control <- subset_samples(
  megan_GO_3,
  !(sample_names(megan_GO_3) %in%
    c("Neg-control-1_S101_R1", "Neg-control-2_S24_R1", "Neg-control-3_S116_R1"))
)
### Rare features are filtered on relative abundance rather than on an arbitrary read-count threshold (e.g. 100 seqs): ***all OTUs with < 0.0001% mean rel. abundance are removed***

# Tutorial followed: https://joey711.github.io/phyloseq/preprocess.html

### Taxa: drop entries without a Phylum (annotated at Bacteria or Archaea level only - not informative)
megan_genus_anno_only <- subset_taxa(megan_genus_no_neg_control, Phylum != "NA")

### GO terms: drop entries that were not annotated at Rank2 level - not informative
megan_GO_3_anno_only <- subset_taxa(megan_GO_3_no_neg_control, Rank2 != "NA")
megan_GO_4_anno_only <- subset_taxa(megan_GO_4_no_neg_control, Rank2 != "NA")
megan_GO_5_anno_only <- subset_taxa(megan_GO_5_no_neg_control, Rank2 != "NA")

# Annotation tables of the retained features as plain data frames
megan_genus_TAX <- as.data.frame(megan_genus_anno_only@tax_table)
megan_GO_3_FUN <- as.data.frame(megan_GO_3_anno_only@tax_table)
megan_GO_4_FUN <- as.data.frame(megan_GO_4_anno_only@tax_table)
megan_GO_5_FUN <- as.data.frame(megan_GO_5_anno_only@tax_table)

### Relative abundances: every count divided by the total of its sample
# Taxa
megan_genus_RA <- transform_sample_counts(megan_genus_anno_only, function(counts) counts / sum(counts))
# GO terms
megan_GO_3_RA <- transform_sample_counts(megan_GO_3_anno_only, function(counts) counts / sum(counts))
megan_GO_4_RA <- transform_sample_counts(megan_GO_4_anno_only, function(counts) counts / sum(counts))
megan_GO_5_RA <- transform_sample_counts(megan_GO_5_anno_only, function(counts) counts / sum(counts))

# Keep only features whose mean relative abundance across samples exceeds
# 1e-6 (a proportion, i.e. 0.0001%); everything else is pruned.
# OTUs
megan_genus_RA_no_rare <- filter_taxa(megan_genus_RA, function(ra) mean(ra) > 1e-6, prune = TRUE)
# GO terms
megan_GO_3_RA_no_rare <- filter_taxa(megan_GO_3_RA, function(ra) mean(ra) > 1e-6, prune = TRUE)
megan_GO_4_RA_no_rare <- filter_taxa(megan_GO_4_RA, function(ra) mean(ra) > 1e-6, prune = TRUE)
megan_GO_5_RA_no_rare <- filter_taxa(megan_GO_5_RA, function(ra) mean(ra) > 1e-6, prune = TRUE)
# Feature counts before and after filtering, one column per data type.
# The "before" counts are taken from megan_genus / megan_GO_5, i.e. the objects
# as they were before the unannotated entries were dropped as well.
Before_after_filtering <- data.frame(
  V1 = c(ntaxa(megan_genus), ntaxa(megan_genus_RA_no_rare)),
  V2 = c(ntaxa(megan_GO_5), ntaxa(megan_GO_5_RA_no_rare)),
  # COGs are currently left out:
  # c(ntaxa(megan_COGs), ntaxa(megan_COGs_RA_no_rare))
  row.names = c("Before filtering", "After filtering")
)

knitr::kable(
  Before_after_filtering,
  caption = "Removal of Rare/Spurious reads (< 0.0001% RA)",
  col.names = c("Taxa", "GO terms"),
  row.names = TRUE
)
Removal of Rare/Spurious reads (< 0.0001% RA)
Taxa GO terms
Before filtering 1257 7476
After filtering 621 4287
# Raw-count objects restricted to the features that passed the rare-feature
# filter: the feature names come from the filtered relative-abundance objects,
# the counts from the negative-control-free raw objects.
megan_genus_abundant <- prune_taxa(
  taxa = taxa_names(megan_genus_RA_no_rare),
  x = megan_genus_no_neg_control
)
megan_GO_5_abundant <- prune_taxa(
  taxa = taxa_names(megan_GO_5_RA_no_rare),
  x = megan_GO_5_no_neg_control
)
megan_GO_4_abundant <- prune_taxa(
  taxa = taxa_names(megan_GO_4_RA_no_rare),
  x = megan_GO_4_no_neg_control
)
megan_GO_3_abundant <- prune_taxa(
  taxa = taxa_names(megan_GO_3_RA_no_rare),
  x = megan_GO_3_no_neg_control
)
# CLR is the normalisation method suggested by the mixOmics R package for microbial data. The geometric mean used by CLR cannot be determined for sparse data without deleting, replacing or estimating the 0 count values, so zeros have to be dealt with before the CLR normalisation. Here this is done by introducing pseudo counts.

### Tutorial used: http://mixomics.org/mixmc/mixmc-preprocessing/

# Checking if there are any zeros - BEFORE adding pseudocounts
# NOTE: which() returns the positions of the zero cells, so the number printed
# below is the SUM OF THOSE POSITIONS, not the number of zeros. It is non-zero
# whenever at least one zero is present, which is all this check relies on.
sum(which(megan_genus_abundant@otu_table == 0))
## [1] 4791141201
sum(which(megan_GO_5_abundant@otu_table == 0))
## [1] 43240551782
# sum(which(megan_COGs_abundant@otu_table == 0))
# Pseudocounts - adding 1 to EVERY count (zeros become 1, all other values shift up by 1)
megan_genus_abundant@otu_table <- megan_genus_abundant@otu_table + 1
megan_GO_5_abundant@otu_table <- megan_GO_5_abundant@otu_table +1
megan_GO_3_abundant@otu_table <- megan_GO_3_abundant@otu_table +1
megan_GO_4_abundant@otu_table <- megan_GO_4_abundant@otu_table +1
# megan_COGs_abundant@otu_table <- megan_COGs_abundant@otu_table + 1
# Checking if there are any zeros - AFTER adding pseudocounts
# (a result of 0 means which() found no zero cells at all)
sum(which(megan_genus_abundant@otu_table == 0))
## [1] 0
sum(which(megan_GO_5_abundant@otu_table == 0))
## [1] 0
# sum(which(megan_COGs_abundant@otu_table == 0))
# All good! No zeros left after introducing pseudocounts

### CLR transformation of the pseudocounted tables (input for the mixOmics analyses)
# The transform used here comes from the microbiome R package; it is not the
# same implementation as the one in mixOmics.
# Taxa
megan_genus_clr <- microbiome::transform(megan_genus_abundant, transform = "clr")
# GO terms at ranks 3, 4 and 5
megan_go_clr_3 <- microbiome::transform(megan_GO_3_abundant, transform = "clr")
megan_go_clr_4 <- microbiome::transform(megan_GO_4_abundant, transform = "clr")
megan_go_clr_5 <- microbiome::transform(megan_GO_5_abundant, transform = "clr")
# megan_COGs_clr <- microbiome::transform(megan_COGs_abundant, "clr")

# Optional restriction of the rank 3 / rank 4 GO tables to biological process only:
# megan_go_clr_3_bp <- megan_go_clr_3 %>%
#  subset_taxa(Rank2 == 'GO:0008150 biological_process')
# megan_go_clr_4_bp <- megan_go_clr_4 %>%
#  subset_taxa(Rank2 == 'GO:0008150 biological_process')
# save.image("/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Code/Code_for_Terzin_et_al_Microbial_Function_Outperforms_Taxonomy_in_Inferring_Water_Chemistry_across_the_Great_Barrier_Reef.RData")

Principal Components Analysis (PCA) - What are the main clustering patterns between our reefs based on microbial taxonomy and function?

# Preparing the object so that taxa names can be shown on the biplots.
# The CLR table is transposed so that samples become rows and taxa columns.
OTUs_biplot <- t(as.data.frame(megan_genus_clr@otu_table))
# Check dimensions of data
dim(OTUs_biplot)
## [1] 191 561
class(OTUs_biplot)
## [1] "matrix" "array"
# Lookup table: OTU identifier -> "Family; Genus" label
OTUs_biplot_colnames_for_biplot <- left_join(
  otu_table(megan_genus_clr) %>%
    as.data.frame() %>%
    rownames_to_column("OTU"),
  tax_table(megan_genus_clr) %>%
    as.data.frame() %>%
    rownames_to_column("OTU")
) %>%
  unite(taxonomy, c(Family, Genus), sep = "; ") %>% # taxonomy label
  dplyr::select("OTU", "taxonomy")
## Joining, by = "OTU"
# Merge the labels into the OTUs_biplot matrix: features are renamed to
# "<OTU>_<Family; Genus>" and the table is returned to samples-in-rows layout
OTUs_biplot_names <- left_join(
  as.data.frame(t(OTUs_biplot)) %>%
    rownames_to_column("OTU"),
  OTUs_biplot_colnames_for_biplot
) %>%
  unite(Annotations, c(OTU, taxonomy), sep = "_") %>%
  column_to_rownames("Annotations") %>% # the combined label becomes the row name
  t() # transposing back into the right format
# PCA on the labelled CLR matrix (pca() is called with its default settings)
result.pca.taxa.names <- pca(OTUs_biplot_names)

# Sample plot, coloured by sampling trip
plotIndiv(
  result.pca.taxa.names,
  group = sample_data(megan_genus_abundant)$Sampling_trip,
  title = 'PCA | Microbial Taxonomy',
  legend = TRUE,
  ellipse = TRUE,
  ind.names = FALSE,
  col.per.group = c(
    "indianred",  # Sampling trip 1
    "indianred4", # Sampling trip 2
    "red3",       # Sampling trip 3
    "slateblue"   # Sampling trip 4
  ),
  legend.title = 'Sampling trip'
)

# Biplot of components 1 and 2; 'cutoff' is set to 0.65 for the variables shown
biplot(
  result.pca.taxa.names,
  comp = c(1, 2),
  group = sample_data(megan_genus_abundant)$Sampling_trip,
  ind.names = FALSE,
  ellipse = TRUE,
  col.per.group = c(
    "indianred",  # Sampling trip 1
    "indianred4", # Sampling trip 2
    "red3",       # Sampling trip 3
    "slateblue"   # Sampling trip 4
  ),
  legend = TRUE,
  vline = TRUE,
  hline = TRUE,
  cutoff = 0.65,
  legend.title = "Sampling trip"
)

# Parameter tuning: scree plot for the first 10 components (scaled data)
tune.pca.taxa <- tune.pca(OTUs_biplot_names, ncomp = 10, scale = TRUE)
plot(tune.pca.taxa)
Screeplot from the PCA performed on the IMOS GBR-MGD metagenomics data (microbial taxonomy): the amount of explained variance for each principal component is shown. From the numerical output (shown below in tabular format), we observe that the first two principal components explain 32.66% of the total variance. The rule of thumb for choosing the number of PCA components is not so much to set a hard threshold based on the cumulative proportion of explained variance (as this is data-dependent), but to observe when a drop, or elbow, appears on the screeplot. The elbow indicates that the remaining variance is spread over many principal components and is not relevant in obtaining a low-dimensional ‘snapshot’ of the data. Based on this, we chose to keep two PCA dimensions.

Screeplot from the PCA performed on the IMOS GBR-MGD metagenomics data (microbial taxonomy): the amount of explained variance for each principal component is shown. From the numerical output (shown below in tabular format), we observe that the first two principal components explain 32.66% of the total variance. The rule of thumb for choosing the number of PCA components is not so much to set a hard threshold based on the cumulative proportion of explained variance (as this is data-dependent), but to observe when a drop, or elbow, appears on the screeplot. The elbow indicates that the remaining variance is spread over many principal components and is not relevant in obtaining a low-dimensional ‘snapshot’ of the data. Based on this, we chose to keep two PCA dimensions.

# Numerical output: centred and scaled PCA with 10 components, used only to
# read off the explained-variance figures
pca.taxa.num <- pca(
  OTUs_biplot_names,
  ncomp = 10,
  center = TRUE,
  scale = TRUE
)

# Explained variance per PCA component
knitr::kable(
  pca.taxa.num$prop_expl_var[["X"]],
  caption = "The proportion of explained variance per each PCA component is:"
)
The proportion of explained variance per each PCA component is:
x
PC1 0.2030513
PC2 0.1235319
PC3 0.0686204
PC4 0.0537983
PC5 0.0456617
PC6 0.0431242
PC7 0.0303674
PC8 0.0226126
PC9 0.0216532
PC10 0.0178213
# Running total of the explained variance across the PCA components
knitr::kable(
  pca.taxa.num[["cum.var"]],
  caption = "The cumulative proportion of variance explained by each PCA component"
)
The cumulative proportion of variance explained by each PCA component
x
PC1 0.2030513
PC2 0.3265833
PC3 0.3952037
PC4 0.4490020
PC5 0.4946637
PC6 0.5377879
PC7 0.5681553
PC8 0.5907679
PC9 0.6124211
PC10 0.6302423
# Preparing the object so that GO annotations can be shown on the biplots.
# The CLR table is transposed so that samples become rows and GO terms columns.
GOs_biplot <- t(as.data.frame(megan_go_clr_5@otu_table))
# Check dimensions of data
dim(GOs_biplot)
## [1]  191 4287
class(GOs_biplot)
## [1] "matrix" "array"
# Lookup table: feature identifier -> Rank4 annotation
GOs_biplot_colnames_for_biplot <- left_join(
  otu_table(megan_go_clr_5) %>%
    as.data.frame() %>%
    rownames_to_column("OTU"),
  tax_table(megan_go_clr_5) %>%
    as.data.frame() %>%
    rownames_to_column("OTU")
) %>%
  unite(Gene_annotations, Rank4, sep = "; ") %>% # Rank4 becomes the annotation label
  dplyr::select("OTU", "Gene_annotations")
## Joining, by = "OTU"
# Merge the labels into the GOs_biplot matrix: features are renamed to
# "<OTU>_<Rank4 annotation>" and the table is returned to samples-in-rows layout
GOs_biplot_names <- left_join(
  as.data.frame(t(GOs_biplot)) %>%
    rownames_to_column("OTU"),
  GOs_biplot_colnames_for_biplot
) %>%
  unite(Annotations, c(OTU, Gene_annotations), sep = "_") %>%
  column_to_rownames("Annotations") %>% # the combined label becomes the row name
  t() # transposing back into the right format
# PCA on the labelled CLR matrix of GO terms (pca() default settings)
result.pca.GOs.names <- pca(GOs_biplot_names)

# Sample plot, coloured by sampling trip
plotIndiv(
  result.pca.GOs.names,
  group = sample_data(megan_GO_5_abundant)$Sampling_trip,
  title = 'PCA | Microbial Functions',
  legend = TRUE,
  ellipse = TRUE,
  ind.names = FALSE,
  col.per.group = c(
    "indianred",  # Sampling trip 1
    "indianred4", # Sampling trip 2
    "red3",       # Sampling trip 3
    "slateblue"   # Sampling trip 4
  ),
  legend.title = 'Sampling trip'
)

# Biplot of components 1 and 2; 'cutoff' is set to 0.95 for the variables shown
biplot(
  result.pca.GOs.names,
  comp = c(1, 2),
  group = sample_data(megan_GO_5_abundant)$Sampling_trip,
  ind.names = FALSE,
  ellipse = TRUE,
  col.per.group = c(
    "indianred",  # Sampling trip 1
    "indianred4", # Sampling trip 2
    "red3",       # Sampling trip 3
    "slateblue"   # Sampling trip 4
  ),
  legend = TRUE,
  vline = TRUE,
  hline = TRUE,
  cutoff = 0.95,
  legend.title = "Sampling trip"
)

# Parameter tuning: scree plot for the first 10 components (scaled data)
tune.pca.GOs <- tune.pca(GOs_biplot_names, ncomp = 10, scale = TRUE)
plot(tune.pca.GOs)
Screeplot from the PCA performed on the IMOS GBR-MGD metagenomics data (microbial genes - GO terms): the amount of explained variance for each principal component is shown. From the numerical output (shown below in tabular format), we observe that the first two principal components explain 46.21% of the total variance. The rule of thumb for choosing the number of PCA components is not so much to set a hard threshold based on the cumulative proportion of explained variance (as this is data-dependent), but to observe when a drop, or elbow, appears on the screeplot. The elbow indicates that the remaining variance is spread over many principal components and is not relevant in obtaining a low-dimensional ‘snapshot’ of the data. Based on this, we chose to keep two PCA dimensions.

Screeplot from the PCA performed on the IMOS GBR-MGD metagenomics data (microbial genes - GO terms): the amount of explained variance for each principal component is shown. From the numerical output (shown below in tabular format), we observe that the first two principal components explain 46.21% of the total variance. The rule of thumb for choosing the number of PCA components is not so much to set a hard threshold based on the cumulative proportion of explained variance (as this is data-dependent), but to observe when a drop, or elbow, appears on the screeplot. The elbow indicates that the remaining variance is spread over many principal components and is not relevant in obtaining a low-dimensional ‘snapshot’ of the data. Based on this, we chose to keep two PCA dimensions.

# Numerical output: centred and scaled PCA with 10 components, used only to
# read off the explained-variance figures
pca.GOs.num <- pca(
  GOs_biplot_names,
  ncomp = 10,
  center = TRUE,
  scale = TRUE
)

# Explained variance per PCA component
knitr::kable(
  pca.GOs.num$prop_expl_var[["X"]],
  caption = "The proportion of explained variance per each PCA component is:"
)
The proportion of explained variance per each PCA component is:
x
PC1 0.3436569
PC2 0.1184468
PC3 0.0592101
PC4 0.0477584
PC5 0.0386741
PC6 0.0292849
PC7 0.0203148
PC8 0.0186514
PC9 0.0143016
PC10 0.0118723
# Running total of the explained variance across the PCA components
knitr::kable(
  pca.GOs.num[["cum.var"]],
  caption = "The cumulative proportion of variance explained by each PCA component"
)
The cumulative proportion of variance explained by each PCA component
x
PC1 0.3436569
PC2 0.4621037
PC3 0.5213138
PC4 0.5690722
PC5 0.6077463
PC6 0.6370312
PC7 0.6573461
PC8 0.6759974
PC9 0.6902990
PC10 0.7021713

Parameter tuning in mixOmics to identify the optimal number of principal components (PCs) showed that the additional variance explained by more than 2 PCs is negligible for both taxonomy and function. Hence, 2 PCs were retained. PCA clustering identified a clear difference between summer and winter samples, for both taxonomy and function. However, this clustering is more evident at the functional than at the taxonomic level. The percentage of variance explained by the first 2 PCs was ~26% for taxonomy and 55% for functions (GO terms). A PERMANOVA test was then carried out to investigate which comparisons are statistically significant.

PERMANOVA

We first use analysis of similarities (ANOSIM) to test whether there is a statistically significant difference between two or more groups of sampling units - here, the sampling trips. We will then perform a Pairwise PERMANOVA.

# Relative-abundance table for ANOSIM: one row per taxon (labelled with its
# full taxonomy string), one column per sample. Rows containing NAs are
# dropped because ANOSIM does not accept missing values.
taxa.anosim <- left_join(
  otu_table(megan_genus_RA_no_rare) %>%
    as.data.frame() %>%
    rownames_to_column("OTU"),
  megan_genus_TAX %>%
    rownames_to_column("OTU")
) %>%
  unite(taxonomy, c(OTU, Domain, Phylum, Class, Order, Family, Genus, Species), sep = "; ") %>%
  column_to_rownames("taxonomy") %>%
  na.omit()

# ANOSIM on Bray-Curtis dissimilarities, samples grouped by sampling trip.
# NOTE(review): no seed is set in this block, so the permutation p-value may
# vary slightly between runs.
ano_taxa <- anosim(
  x = t(taxa.anosim), # samples in rows
  grouping = sample_data(megan_genus_RA_no_rare)$Sampling_trip,
  distance = "bray",
  permutations = 9999
)
# Results
ano_taxa
## 
## Call:
## anosim(x = t(taxa.anosim), grouping = sample_data(megan_genus_RA_no_rare)$Sampling_trip,      permutations = 9999, distance = "bray") 
## Dissimilarity: bray 
## 
## ANOSIM statistic R: 0.2244 
##       Significance: 1e-04 
## 
## Permutation: free
## Number of permutations: 9999

Pairwise PERMANOVA - taxa

Visualisation

Phylum level

What are the most abundant phyla?

Out of the 29 bacterial and archaeal phyla we identified, the three most abundant were, in decreasing order, Cyanobacteria, Proteobacteria, and Bacteroidetes. Bacteroidetes increased in abundance in the samples collected during the peak of summer (February 2020), and were lowest in abundance during winter (July 2020).

# Preparing the objects for the phylum-level bar plots.
# Counts and taxonomy are joined into one table, one row per OTU.
# NOTE(review): megan_genus_abundant already had 1 added to every count in the
# pseudocount step, so the relative abundances below are computed from
# pseudocounted values - confirm this is intended.
taxa.barplots_phylum <- left_join(
  otu_table(megan_genus_abundant) %>%
    as.data.frame() %>%
    rownames_to_column("OTU"),
  tax_table(megan_genus_abundant) %>%
    as.data.frame() %>%
    rownames_to_column("OTU")
)
## Joining with `by = join_by(OTU)`
# OTU identifiers become the row names
taxa.barplots_phylum <- taxa.barplots_phylum %>%
  column_to_rownames("OTU")

# Sum the counts of all OTUs belonging to the same phylum (numeric columns only)
taxa.barplots_phylum.sum <- ddply(taxa.barplots_phylum, "Phylum", numcolwise(sum))

# Relative abundance per sample: each phylum total divided by its column sum.
# The first column (Phylum) is not numeric and is left untouched.
taxa.barplots_phylum.sum_RA <- taxa.barplots_phylum.sum
taxa.barplots_phylum.sum_RA[-1] <- lapply(
  taxa.barplots_phylum.sum_RA[-1],
  function(counts) counts / sum(counts)
)
# taxa.barplots_phylum.sum_RA

# Phylum names become row names, then transpose to samples-in-rows - RA
taxa.barplots_phylum.sum_RA <- taxa.barplots_phylum.sum_RA %>%
  remove_rownames() %>%
  column_to_rownames(var = "Phylum")
taxa.barplots_phylum_transposed_RA <- t(taxa.barplots_phylum.sum_RA)

# Sample identifiers back into a column, then melt to long format - RA
taxa.barplots_phylum_transposed_RA <- tibble::rownames_to_column(
  as.data.frame(taxa.barplots_phylum_transposed_RA),
  "Sample_ID"
)
taxa.barplots_phylum_transposed_melt_RA <- reshape2::melt(
  taxa.barplots_phylum_transposed_RA,
  id.vars = c("Sample_ID")
)

# Finally, attach the sample metadata so that the plot can be faceted - RA
taxa.barplots_phylum_transposed_melt_RA <- left_join(
  taxa.barplots_phylum_transposed_melt_RA,
  metadata %>% rownames_to_column("Sample_ID")
)
## Joining with `by = join_by(Sample_ID)`
# Fixed phylum -> colour mapping (hard-coded) used as the manual fill scale of
# the phylum-level bar plot
cols_phyla <- c(
  "Acidobacteria" = "skyblue2",
  "Actinobacteria" = "slateblue4",
  "Bacteroidetes" = "salmon1",
  "Balneolaeota" = "plum2",
  "Candidatus Kaiserbacteria" = "skyblue1",
  "Candidatus Marinimicrobia" = "slategray4",
  "Candidatus Peregrinibacteria" = "lavenderblush3",
  "Candidatus Tectomicrobia" = "tomato1",
  "Candidatus Thermoplasmatota" = "lightgoldenrod1",
  "Chlamydiae" = "olivedrab",
  "Chlorobi" = "seagreen3",
  "Chloroflexi" = "slateblue3",
  "Cyanobacteria" = "darkseagreen3",
  "Deinococcus-Thermus" = "rosybrown4",
  "Euryarchaeota" = "violetred1",
  "Fibrobacteres" = "navajowhite2",
  "Firmicutes" = "indianred",
  "Fusobacteria" = "skyblue3",
  "Gemmatimonadetes" = "tomato2",
  "Lentisphaerae" = "lightyellow",
  "Nitrospinae" = "khaki",
  "Nitrospirae" = "rosybrown",
  "Planctomycetes" = "mediumpurple1",
  "Proteobacteria" = "lightblue",
  "Rhodothermaeota" = "tomato3",
  "Spirochaetes" = "wheat1",
  "Tenericutes" = "palegoldenrod",
  "Thaumarchaeota" = "plum4",
  "Verrucomicrobia" = "tan1"
)

# Stacked bar plot of per-sample relative abundances, filled by `variable`
# and split into one facet row per sampling trip.
ggplot(
  data = taxa.barplots_phylum_transposed_melt_RA,
  mapping = aes(x = Sample_ID, y = value, fill = variable)
) +
  geom_col() + # bar heights are taken directly from `value`
  scale_y_continuous(expand = c(0, 0)) +
  # Alternative layout tried earlier:
  #   facet_grid(~Sampling_trip, scales = "free_x", space = "free")
  facet_wrap(~Sampling_trip, scales = "free", nrow = 4) +
  scale_fill_manual(values = cols_phyla) +
  labs(
    x = "Reef sites",
    y = "Relative abundance of taxa (at Phylum level)"
  ) +
  theme(
    axis.text.x = element_text(angle = 75, hjust = 1, size = 12),
    # Optional: axis.ticks.x = element_blank(), axis.title.x = element_blank()
    strip.text = element_text(colour = "black", size = 12),
    panel.grid = element_blank(),
    panel.background = element_blank(),
    legend.position = "right",
    legend.title = element_blank(),
    legend.text = element_text(size = 12)
  )

# ggarrange(admix.bar_data, admix.bar_data_RA,
#          ncol = 1, nrow = 2)
# Mean relative abundance of every phylum across all samples, using base R
# aggregate(); the result has columns Group.1 (phylum) and x (mean).
phylum_mean <- with(
  taxa.barplots_phylum_transposed_melt_RA,
  aggregate(value, by = list(variable), FUN = mean)
)
# Sanity check: the per-phylum means should add up to 1
sum(phylum_mean$x)
## [1] 1
# It worked

# Print as a table, sorted from the most to the least abundant phylum.
phylum_mean %>%
  arrange(desc(x)) %>%
  knitr::kable(caption = "Mean relative abundances at Phylum level, across all samples. We observe that 47.27% of the reads cannot be annotated bellow the Phylum level.")
Mean relative abundances at Phylum level, across all samples. We observe that 47.27% of the reads cannot be annotated bellow the Phylum level.
Group.1 x
Cyanobacteria 0.6817959
Proteobacteria 0.2612396
Bacteroidetes 0.0253382
Actinobacteria 0.0159741
Planctomycetes 0.0048641
Firmicutes 0.0039481
Verrucomicrobia 0.0021430
Balneolaeota 0.0018664
Thaumarchaeota 0.0006134
Spirochaetes 0.0005483
Euryarchaeota 0.0004341
Candidatus Thermoplasmatota 0.0002843
Fusobacteria 0.0001528
Lentisphaerae 0.0001510
Rhodothermaeota 0.0001400
Tenericutes 0.0001209
Nitrospinae 0.0000852
Candidatus Tectomicrobia 0.0000550
Nitrospirae 0.0000548
Acidobacteria 0.0000537
Chloroflexi 0.0000419
Deinococcus-Thermus 0.0000264
Candidatus Peregrinibacteria 0.0000184
Chlamydiae 0.0000158
Candidatus Kaiserbacteria 0.0000110
Gemmatimonadetes 0.0000106
Fibrobacteres 0.0000058
Candidatus Marinimicrobia 0.0000051
Chlorobi 0.0000020
# Taxon IDs to drop; judging by the variable name these are the entries
# without a phylum-level annotation.
OTUs_non_annotated_phyla_to_remove <- c("1869227", "2", "2157")
megan_genus_abundant_known_phyla_only <- subset_taxa(
  megan_genus_abundant,
  !taxa_names(megan_genus_abundant) %in% OTUs_non_annotated_phyla_to_remove
)

# Prepare the bar plot input: put the per-sample counts and the taxonomy of
# each taxon side by side, keyed by taxon ID (kept as the row names).
taxa.barplots_phylum_known <- local({
  counts <- rownames_to_column(
    as.data.frame(otu_table(megan_genus_abundant_known_phyla_only)),
    "OTU"
  )
  taxonomy <- rownames_to_column(
    as.data.frame(tax_table(megan_genus_abundant_known_phyla_only)),
    "OTU"
  )
  column_to_rownames(left_join(counts, taxonomy), "OTU")
})
## Joining with `by = join_by(OTU)`
# Sum the raw counts per phylum: one row per Phylum, one numeric column per sample
taxa.barplots_phylum_known.sum <- ddply(taxa.barplots_phylum_known, "Phylum", numcolwise(sum))

# Relative abundance per sample: divide every count by its column total.
# `[-1]` skips the first column (Phylum), which is not numeric.
taxa.barplots_phylum_known.sum_RA <- taxa.barplots_phylum_known.sum
taxa.barplots_phylum_known.sum_RA[-1] <- lapply(
  taxa.barplots_phylum_known.sum[-1],
  function(counts) counts / sum(counts)
)
# taxa.barplots_phylum.sum_RA

# Move Phylum into the row names, then transpose so that samples become rows
taxa.barplots_phylum_known.sum_RA <- column_to_rownames(
  remove_rownames(taxa.barplots_phylum_known.sum_RA),
  var = "Phylum"
)

# Restore the sample names as a regular column and melt to long format
# (one row per sample x phylum)
taxa.barplots_phylum_known_transposed_RA <- t(taxa.barplots_phylum_known.sum_RA) %>%
  as.data.frame() %>%
  tibble::rownames_to_column("Sample_ID")
taxa.barplots_phylum_known_transposed_melt_RA <- reshape2::melt(
  taxa.barplots_phylum_known_transposed_RA,
  id.vars = "Sample_ID"
)

# Finally attach the sample metadata so the plot can be faceted (e.g. by Sampling_trip)
taxa.barplots_phylum_known_transposed_melt_RA <- left_join(
  taxa.barplots_phylum_known_transposed_melt_RA,
  rownames_to_column(metadata, "Sample_ID")
)
## Joining with `by = join_by(Sample_ID)`
# Hard-coded fill colours for the known-phyla bar plot: a named character
# vector mapping each phylum name to an R colour name. It is passed to
# scale_fill_manual(values = ...) when plotting.
# NOTE(review): these entries match the visible tail of `cols_phyla`; if the
# two palettes are meant to stay in sync, consider deriving one from the
# other - confirm before changing.
cols_phyla_known <- c(
  "Acidobacteria" = "skyblue2",                      # Acidobacteria
  "Actinobacteria" = "slateblue4",                   # Actinobacteria
  "Bacteroidetes" = "salmon1",                       # Bacteroidetes
  "Balneolaeota" = "plum2",                          # Balneolaeota
  "Candidatus Kaiserbacteria" = "skyblue1",          # Candidatus Kaiserbacteria
  "Candidatus Marinimicrobia" = "slategray4",        # Candidatus Marinimicrobia
  "Candidatus Peregrinibacteria" = "lavenderblush3", # Candidatus Peregrinibacteria
  "Candidatus Tectomicrobia" = "tomato1",            # Candidatus Tectomicrobia
  "Candidatus Thermoplasmatota" = "lightgoldenrod1", # Candidatus Thermoplasmatota
  "Chlamydiae" = "olivedrab",                        # Chlamydiae
  "Chlorobi" = "seagreen3",                          # Chlorobi
  "Chloroflexi" = "slateblue3",                      # Chloroflexi
  "Cyanobacteria" = "darkseagreen3",                 # Cyanobacteria
  "Deinococcus-Thermus" = "rosybrown4",              # Deinococcus-Thermus
  "Euryarchaeota" = "violetred1",                    # Euryarchaeota
  "Fibrobacteres" = "navajowhite2",                  # Fibrobacteres
  "Firmicutes" = "indianred",                        # Firmicutes
  "Fusobacteria" = "skyblue3",                       # Fusobacteria
  "Gemmatimonadetes" = "tomato2",                    # Gemmatimonadetes
  "Lentisphaerae" = "lightyellow",                   # Lentisphaerae
  "Nitrospinae" = "khaki",                           # Nitrospinae
  "Nitrospirae" = "rosybrown",                       # Nitrospirae
  "Planctomycetes" = "mediumpurple1",                # Planctomycetes
  "Proteobacteria" = "lightblue",                    # Proteobacteria
  "Rhodothermaeota" = "tomato3",                     # Rhodothermaeota
  "Spirochaetes" = "wheat1",                         # Spirochaetes
  "Tenericutes" = "palegoldenrod",                   # Tenericutes
  "Thaumarchaeota" = "plum4",                        # Thaumarchaeota
  "Verrucomicrobia" = "tan1"                        # Verrucomicrobia
)

# Stacked bar plot of per-sample relative abundances for the known phyla,
# filled by `variable` and split into one facet row per sampling trip.
ggplot(
  data = taxa.barplots_phylum_known_transposed_melt_RA,
  mapping = aes(x = Sample_ID, y = value, fill = variable)
) +
  geom_col() + # bar heights are taken directly from `value`
  scale_y_continuous(expand = c(0, 0)) +
  # Alternative layout tried earlier:
  #   facet_grid(~Sampling_trip, scales = "free_x", space = "free")
  facet_wrap(~Sampling_trip, scales = "free", nrow = 4) +
  scale_fill_manual(values = cols_phyla_known) +
  labs(
    x = "Reef sites",
    y = "Relative abundance of taxa (at Phylum level)"
  ) +
  theme(
    axis.text.x = element_text(angle = 75, hjust = 1, size = 12),
    # Optional: axis.ticks.x = element_blank(), axis.title.x = element_blank()
    strip.text = element_text(colour = "black", size = 12),
    panel.grid = element_blank(),
    panel.background = element_blank(),
    legend.position = "right",
    legend.title = element_blank(),
    legend.text = element_text(size = 12)
  )

# ggarrange(admix.bar_data, admix.bar_data_RA,
#          ncol = 1, nrow = 2)
# Mean relative abundance of every phylum within each sampling trip, using
# base R aggregate(); Group.1 = phylum, Group.2 = sampling trip, x = mean.
phylum_mean_per_trip <- with(
  taxa.barplots_phylum_known_transposed_melt_RA,
  aggregate(value, by = list(variable, Sampling_trip), FUN = mean)
)
# Sanity check: the means sum to 1 within each trip, so the grand total
# should equal the number of sampling trips (4)
sum(phylum_mean_per_trip$x)
## [1] 4
# It worked

# Wide table: one row per phylum, one column per sampling trip
phylum_mean_per_trip %>%
  dcast(Group.1 ~ Group.2, value.var = "x") %>%
  knitr::kable(caption = "Mean relative abundances at Phylum level, partitioned per trip.")
Mean relative abundances at Phylum level, partitioned per trip.
Group.1 Trip_01_Nov-Dec_2019 Trip_02_January_2020 Trip_03_February_2020 Trip_04_July_2020
Acidobacteria 0.0000002 0.0000789 0.0000600 0.0000691
Actinobacteria 0.0199151 0.0206680 0.0131711 0.0110066
Bacteroidetes 0.0231314 0.0179987 0.0539265 0.0114112
Balneolaeota 0.0030615 0.0018065 0.0023532 0.0006050
Candidatus Kaiserbacteria 0.0000002 0.0000429 0.0000003 0.0000004
Candidatus Marinimicrobia 0.0000211 0.0000002 0.0000003 0.0000004
Candidatus Peregrinibacteria 0.0000188 0.0000392 0.0000170 0.0000012
Candidatus Tectomicrobia 0.0000002 0.0000499 0.0000074 0.0001389
Candidatus Thermoplasmatota 0.0000304 0.0004387 0.0000276 0.0005486
Chlamydiae 0.0000007 0.0000599 0.0000010 0.0000012
Chlorobi 0.0000002 0.0000068 0.0000003 0.0000004
Chloroflexi 0.0000002 0.0000542 0.0000459 0.0000611
Cyanobacteria 0.6667602 0.6833643 0.6592973 0.7095411
Deinococcus-Thermus 0.0000250 0.0000805 0.0000006 0.0000008
Euryarchaeota 0.0004029 0.0004352 0.0004526 0.0004434
Fibrobacteres 0.0000005 0.0000211 0.0000006 0.0000008
Firmicutes 0.0039582 0.0039924 0.0036522 0.0041295
Fusobacteria 0.0001832 0.0002426 0.0001629 0.0000441
Gemmatimonadetes 0.0000005 0.0000207 0.0000223 0.0000008
Lentisphaerae 0.0001383 0.0001333 0.0003590 0.0000166
Nitrospinae 0.0000292 0.0000557 0.0001079 0.0001370
Nitrospirae 0.0000292 0.0000829 0.0000454 0.0000582
Planctomycetes 0.0049462 0.0052827 0.0039829 0.0051175
Proteobacteria 0.2734323 0.2614350 0.2589273 0.2532678
Rhodothermaeota 0.0001171 0.0001941 0.0002318 0.0000412
Spirochaetes 0.0003928 0.0004973 0.0004043 0.0008249
Tenericutes 0.0002031 0.0000010 0.0003262 0.0000016
Thaumarchaeota 0.0006138 0.0006689 0.0005068 0.0006472
Verrucomicrobia 0.0025871 0.0022482 0.0019091 0.0018836
# Prepare the genus-level bar plot input: per-sample counts and taxonomy of
# each taxon side by side, keyed by taxon ID (kept as the row names).
counts_collapsed_genera <- local({
  counts <- rownames_to_column(
    as.data.frame(otu_table(megan_genus_abundant_known_phyla_only)),
    "OTU"
  )
  taxonomy <- rownames_to_column(
    as.data.frame(tax_table(megan_genus_abundant_known_phyla_only)),
    "OTU"
  )
  column_to_rownames(left_join(counts, taxonomy), "OTU")
})
## Joining with `by = join_by(OTU)`
# Sum the raw counts per genus: one row per Genus (moved into the row names),
# one numeric column per sample.
# taxa.barplots_top20_genera.sum_1 <- ddply(taxa.barplots_top20_genera, "Rank7", numcolwise(sum))
counts_collapsed_genera.sum <- ddply(counts_collapsed_genera, "Genus", numcolwise(sum)) %>%
  column_to_rownames("Genus")

# ------------------------------------------- #
# Now finding the top 20 most abundant genera #
# ------------------------------------------- #

# Order the genera by their average count across all samples (most abundant
# first). The ordering is computed directly, so no temporary avg_value column
# has to be added and removed again.
counts_collapsed_genera.sum <- counts_collapsed_genera.sum[
  order(rowMeans(counts_collapsed_genera.sum), decreasing = TRUE), ,
  drop = FALSE
]

# Move the unknown-genus row (row name "NA") to the end of the data frame.
# Guarded on purpose: with no such row, which() returns integer(0) and
# `df[-integer(0), ]` would select ZERO rows, silently emptying the table.
row_index <- which(rownames(counts_collapsed_genera.sum) == "NA")
if (length(row_index) > 0) {
  counts_collapsed_genera.sum <- rbind(
    counts_collapsed_genera.sum[-row_index, , drop = FALSE],
    counts_collapsed_genera.sum[row_index, , drop = FALSE]
  )
}

# Now collapsing all values below the top 20 most abundant genera into "Other".
# head()/tail() keep the row names and stay correct with 20 or fewer genera
# (`21:n()` would count backwards in that case).
counts_collapsed_genera.sum.Others <- counts_collapsed_genera.sum
counts_collapsed_genera.sum.Others_top <- head(counts_collapsed_genera.sum.Others, 20)
counts_collapsed_genera.sum.Others_bottom <- tail(counts_collapsed_genera.sum.Others, -20)

# Summarise rows from the 21st onwards into one summary row: a 1 x n matrix
# named "Other" holding the per-sample column totals
is_count_col <- vapply(counts_collapsed_genera.sum.Others_bottom, is.numeric, logical(1))
summary_row <- rbind(
  Other = colSums(counts_collapsed_genera.sum.Others_bottom[, is_count_col, drop = FALSE])
)

# Combine the top 20 rows with the summary row containing all other taxa (including those with "Unknown Genus")
counts_collapsed_genera.sum_21 <- rbind(counts_collapsed_genera.sum.Others_top, summary_row)

# Relative abundance per sample: divide every count by its column total.
# Assigning into `[]` keeps the data frame structure and its row names.
taxa.barplots_top20_genera.sum_RA <- counts_collapsed_genera.sum_21
taxa.barplots_top20_genera.sum_RA[] <- lapply(
  counts_collapsed_genera.sum_21,
  function(counts) counts / sum(counts)
)
taxa.barplots_top20_genera.sum_RA
# Checking that it sums up to 1
colSums(taxa.barplots_top20_genera.sum_RA)
##          11-049-1_S89_R1          11-049-2_S90_R1          11-049-3_S91_R1 
##                        1                        1                        1 
##          11-049-4_S92_R1          11-162-1_S81_R1          11-162-2_S82_R1 
##                        1                        1                        1 
##          11-162-3_S83_R1          11-162-4_S84_R1           13-124-1_S9_R1 
##                        1                        1                        1 
##          13-124-2_S10_R1          13-124-3_S11_R1          13-124-4_S12_R1 
##                        1                        1                        1 
##          21-550-1_S69_R1          21-550-2_S70_R1          21-550-3_S71_R1 
##                        1                        1                        1 
##          21-550-4_S72_R1          21-580-1_S57_R1          21-580-2_S58_R1 
##                        1                        1                        1 
##          21-580-3_S59_R1          21-580-4_S60_R1          22-084-1_S41_R1 
##                        1                        1                        1 
##          22-084-2_S42_R1          22-084-3_S43_R1          22-084-4_S44_R1 
##                        1                        1                        1 
##      Agincourt1-1_S33_R1      Agincourt1-2_S34_R1      Agincourt1-3_S35_R1 
##                        1                        1                        1 
##      Agincourt1-4_S36_R1       Arlington-1_S37_R1       Arlington-2_S38_R1 
##                        1                        1                        1 
##       Arlington-3_S39_R1       Arlington-4_S40_R1           Boult-1_S25_R1 
##                        1                        1                        1 
##           Boult-2_S26_R1           Boult-3_S27_R1           Boult-4_S28_R1 
##                        1                        1                        1 
##      Broomfield-1_S49_R1      Broomfield-3_S51_R1      Broomfield-4_S52_R1 
##                        1                        1                        1 
## Broomfield-rpt-2_S115_R1       Centipede-1_S57_R1       Centipede-2_S58_R1 
##                        1                        1                        1 
##       Centipede-3_S59_R1       Centipede-4_S60_R1         Chicken-1_S69_R1 
##                        1                        1                        1 
##         Chicken-2_S70_R1         Chicken-3_S71_R1         Chicken-4_S72_R1 
##                        1                        1                        1 
##        Chinaman-1_S65_R1        Chinaman-2_S66_R1        Chinaman-3_S67_R1 
##                        1                        1                        1 
##        Chinaman-4_S68_R1         Corbett-1_S17_R1         Corbett-2_S18_R1 
##                        1                        1                        1 
##         Corbett-3_S19_R1         Corbett-4_S20_R1            Davie-1_S1_R1 
##                        1                        1                        1 
##            Davie-2_S2_R1            Davie-3_S3_R1            Davie-4_S4_R1 
##                        1                        1                        1 
##         Erskine-1_S61_R1         Erskine-2_S62_R1         Erskine-3_S63_R1 
##                        1                        1                        1 
##         Erskine-4_S64_R1         Fairfax-1_S33_R1         Fairfax-2_S34_R1 
##                        1                        1                        1 
##         Fairfax-3_S35_R1         Fairfax-4_S36_R1     Farquaharson-1_S1_R1 
##                        1                        1                        1 
##     Farquaharson-2_S2_R1     Farquaharson-3_S3_R1     Farquaharson-4_S4_R1 
##                        1                        1                        1 
##          Feather-1_S5_R1          Feather-2_S6_R1          Feather-3_S7_R1 
##                        1                        1                        1 
##          Feather-4_S8_R1    Fore-and-Aft-1_S77_R1    Fore-and-Aft-2_S78_R1 
##                        1                        1                        1 
##    Fore-and-Aft-3_S79_R1    Fore-and-Aft-4_S80_R1            Fork-1_S49_R1 
##                        1                        1                        1 
##            Fork-2_S50_R1            Fork-3_S51_R1            Fork-4_S52_R1 
##                        1                        1                        1 
##            Grub-1_S65_R1            Grub-2_S66_R1            Grub-3_S67_R1 
##                        1                        1                        1 
##            Grub-4_S68_R1        Hastings-1_S41_R1        Hastings-2_S42_R1 
##                        1                        1                        1 
##        Hastings-3_S43_R1        Hastings-4_S44_R1          Hedley-1_S21_R1 
##                        1                        1                        1 
##          Hedley-2_S22_R1          Hedley-3_S23_R1           Helix-1_S61_R1 
##                        1                        1                        1 
##           Helix-2_S62_R1           Helix-3_S63_R1           Helix-4_S64_R1 
##                        1                        1                        1 
##          Hoskyn-1_S29_R1          Hoskyn-2_S30_R1          Hoskyn-3_S31_R1 
##                        1                        1                        1 
##          Hoskyn-4_S32_R1      JohnBrewer-1_S93_R1      JohnBrewer-2_S94_R1 
##                        1                        1                        1 
##      JohnBrewer-3_S97_R1      JohnBrewer-4_S98_R1           Kelso-1_S85_R1 
##                        1                        1                        1 
##           Kelso-2_S86_R1           Kelso-3_S87_R1           Kelso-4_S88_R1 
##                        1                        1                        1 
##           Knife-1_S45_R1           Knife-2_S46_R1           Knife-3_S47_R1 
##                        1                        1                        1 
##           Knife-4_S48_R1          Lagoon-1_S13_R1          Lagoon-2_S14_R1 
##                        1                        1                        1 
##          Lagoon-3_S15_R1          Lagoon-4_S16_R1     LittleKelso-1_S81_R1 
##                        1                        1                        1 
##     LittleKelso-2_S82_R1     LittleKelso-3_S83_R1     LittleKelso-4_S84_R1 
##                        1                        1                        1 
##          Lynchs-1_S99_R1         Lynchs-2_S100_R1         Lynchs-3_S101_R1 
##                        1                        1                        1 
##         Lynchs-4_S102_R1          Mantis-1_S85_R1          Mantis-2_S86_R1 
##                        1                        1                        1 
##          Mantis-3_S87_R1          Mantis-4_S88_R1        Masthead-1_S53_R1 
##                        1                        1                        1 
##        Masthead-2_S54_R1        Masthead-3_S55_R1        Masthead-4_S56_R1 
##                        1                        1                        1 
##       McCulloch-1_S17_R1       McCulloch-2_S18_R1       McCulloch-3_S19_R1 
##                        1                        1                        1 
##       McCulloch-4_S20_R1        McSweeney-1_S5_R1        McSweeney-2_S6_R1 
##                        1                        1                        1 
##        McSweeney-3_S7_R1        McSweeney-4_S8_R1         Monsoon-1_S21_R1 
##                        1                        1                        1 
##         Monsoon-2_S22_R1         Monsoon-3_S23_R1         Monsoon-4_S24_R1 
##                        1                        1                        1 
##           Moore-1_S25_R1           Moore-2_S26_R1           Moore-3_S27_R1 
##                        1                        1                        1 
##           Moore-4_S28_R1        Myrmidon-1_S53_R1        Myrmidon-2_S54_R1 
##                        1                        1                        1 
##        Myrmidon-3_S55_R1        Myrmidon-4_S56_R1           North-1_S37_R1 
##                        1                        1                        1 
##           North-2_S38_R1           North-3_S39_R1           North-4_S40_R1 
##                        1                        1                        1 
##           Peart-1_S13_R1           Peart-2_S14_R1           Peart-3_S15_R1 
##                        1                        1                        1 
##           Peart-4_S16_R1             Rib-1_S73_R1             Rib-2_S74_R1 
##                        1                        1                        1 
##             Rib-3_S75_R1             Rib-4_S76_R1        Roxburgh-1_S89_R1 
##                        1                        1                        1 
##        Roxburgh-2_S90_R1        Roxburgh-3_S91_R1        Roxburgh-4_S92_R1 
##                        1                        1                        1 
##        Sanbank1-1_S77_R1        Sanbank1-2_S78_R1        Sanbank1-3_S79_R1 
##                        1                        1                        1 
##        Sanbank1-4_S80_R1     SmallLagoon-1_S45_R1     SmallLagoon-2_S46_R1 
##                        1                        1                        1 
##     SmallLagoon-3_S47_R1     SmallLagoon-4_S48_R1      St-Crispin-1_S73_R1 
##                        1                        1                        1 
##      St-Crispin-2_S74_R1      St-Crispin-3_S75_R1      St-Crispin-4_S76_R1 
##                        1                        1                        1 
##           Taylor-1_S9_R1          Taylor-2_S10_R1          Taylor-3_S11_R1 
##                        1                        1                        1 
##          Taylor-4_S12_R1        Thetford-1_S29_R1        Thetford-2_S30_R1 
##                        1                        1                        1 
##        Thetford-3_S31_R1        Thetford-4_S32_R1 
##                        1                        1
# Transpose so that samples become rows and genera (plus "Other") become
# columns, then restore the sample names as a regular column
taxa.barplots_top20_genera_transposed_RA <- t(taxa.barplots_top20_genera.sum_RA) %>%
  as.data.frame() %>%
  tibble::rownames_to_column("Sample_ID")

# Melt to long format: one row per sample x genus
taxa.barplots_top20_genera_transposed_melt_RA <- reshape2::melt(
  taxa.barplots_top20_genera_transposed_RA,
  id.vars = "Sample_ID"
)

# Finally attach the sample metadata so the plot can be faceted (e.g. by Sampling_trip)
taxa.barplots_top20_genera_transposed_melt_RA <- left_join(
  taxa.barplots_top20_genera_transposed_melt_RA,
  rownames_to_column(metadata, "Sample_ID")
)
## Joining with `by = join_by(Sample_ID)`
# Hard-coded fill colours for the genus bar plot: a named character vector
# mapping each of the 20 listed genera, plus the collapsed "Other" category,
# to an R colour name. It is passed to scale_fill_manual(values = ...) when
# plotting.
cols_top20_genera <- c(
  "Synechococcus" = "seagreen",
  "Candidatus Pelagibacter" = "steelblue4",
  "Prochlorococcus" = "palegreen3",
  "Candidatus Actinomarina" = "powderblue",
  "Candidatus Puniceispirillum" = "seashell2",
  "Cyanobium" = "seagreen2",
  "Marinovum" = "olivedrab1",
  "Luminiphilus" = "maroon",
  "Pseudomonas" = "grey45",
  "Polaribacter" = "salmon",
  "Rhodopirellula" = "tan",
  "Flavobacterium" = "indianred4",
  "Litoricola" = "turquoise",
  "Vibrio" = "indianred3",
  "Balneola" = "seagreen3",
  "Erythrobacter" = "plum4",
  "Pseudoalteromonas" = "slategrey",
  "Sulfitobacter" = "plum",
  "Nisaea" = "steelblue3",
  "Candidatus Endolissoclinum" = "tan2",
  "Other" = "snow3"
)

# Stacked bar plot of per-sample relative abundances for the top 20 genera
# (plus "Other"), split into one facet row per sampling trip. The plot object
# is stored and then printed.
admix_top20_genera_RA <- ggplot(
  data = taxa.barplots_top20_genera_transposed_melt_RA,
  mapping = aes(x = Sample_ID, y = value, fill = variable)
) +
  geom_col() + # bar heights are taken directly from `value`
  scale_y_continuous(expand = c(0, 0)) +
  # Alternative layouts tried earlier:
  #   facet_wrap(~Sampling_trip, scales = "free", ncol = 1, nrow = 4)
  #   facet_grid(~Sampling_trip, scales = "free_x", space = "free")
  facet_wrap(~Sampling_trip, scales = "free", nrow = 4) +
  scale_fill_manual(values = cols_top20_genera) +
  labs(
    x = "Reef sites",
    y = "Relative abundances of taxa (at 'Genus' level)"
  ) +
  theme(
    axis.text.x = element_text(angle = 75, hjust = 1, size = 12),
    # Optional: axis.ticks.x = element_blank(), axis.title.x = element_blank()
    strip.text = element_text(colour = "black", size = 12),
    panel.grid = element_blank(),
    panel.background = element_blank(),
    legend.position = "right",
    legend.title = element_blank(),
    legend.text = element_text(size = 12)
  )
admix_top20_genera_RA

# Mean relative abundance of every genus category across all samples, using
# base R aggregate(); the result has columns Group.1 (genus) and x (mean).
genera_mean <- with(
  taxa.barplots_top20_genera_transposed_melt_RA,
  aggregate(value, by = list(variable), FUN = mean)
)
# Sanity check: the per-genus means should add up to 1
sum(genera_mean$x)
## [1] 1
# It worked

# Print as a table, sorted from the most to the least abundant category.
genera_mean %>%
  arrange(desc(x)) %>%
  knitr::kable(caption = "Mean relative abundances at Genus level, across all samples. We only show the balue for the top 20 most abundant genera, and values for all others are collapsed within the category Other, shown in grey on the barplots.")
Mean relative abundances at Genus level, across all samples. We only show the balue for the top 20 most abundant genera, and values for all others are collapsed within the category Other, shown in grey on the barplots.
Group.1 x
Synechococcus 0.5498770
Candidatus Pelagibacter 0.1589150
Prochlorococcus 0.1193653
Other 0.1012086
Candidatus Actinomarina 0.0120422
Candidatus Puniceispirillum 0.0090199
Marinovum 0.0074967
Cyanobium 0.0072463
Luminiphilus 0.0053392
Pseudomonas 0.0050729
Polaribacter 0.0048884
Rhodopirellula 0.0029601
Flavobacterium 0.0025491
Vibrio 0.0020786
Litoricola 0.0020743
Pseudoalteromonas 0.0018675
Balneola 0.0018007
Erythrobacter 0.0016710
Sulfitobacter 0.0015728
Nisaea 0.0014830
Candidatus Endolissoclinum 0.0014713
# Mean relative abundance of every genus category within each sampling trip,
# using base R aggregate(); Group.1 = genus, Group.2 = sampling trip, x = mean.
genera_mean_per_trip <- with(
  taxa.barplots_top20_genera_transposed_melt_RA,
  aggregate(value, by = list(variable, Sampling_trip), FUN = mean)
)
# Sanity check: the means sum to 1 within each trip, so the grand total
# should equal the number of sampling trips (4)
sum(genera_mean_per_trip$x)
## [1] 4
# It worked

# Wide table: one row per genus category, one column per sampling trip
genera_mean_per_trip %>%
  dcast(Group.1 ~ Group.2, value.var = "x") %>%
  knitr::kable(caption = "Mean relative abundances at Genus level, partitioned per trip.")
Mean relative abundances at Genus level, partitioned per trip.
Group.1 Trip_01_Nov-Dec_2019 Trip_02_January_2020 Trip_03_February_2020 Trip_04_July_2020
Synechococcus 0.5949829 0.6450816 0.6313425 0.3702787
Candidatus Pelagibacter 0.1624467 0.1490722 0.1555976 0.1671242
Prochlorococcus 0.0581456 0.0241497 0.0147776 0.3293883
Candidatus Actinomarina 0.0164315 0.0165645 0.0093994 0.0067467
Candidatus Puniceispirillum 0.0076347 0.0093176 0.0108462 0.0084509
Cyanobium 0.0083558 0.0082732 0.0079558 0.0049497
Marinovum 0.0111624 0.0079801 0.0057018 0.0055803
Luminiphilus 0.0070972 0.0052074 0.0066333 0.0030773
Pseudomonas 0.0057212 0.0056568 0.0055251 0.0037157
Polaribacter 0.0007300 0.0005458 0.0199170 0.0003381
Rhodopirellula 0.0033751 0.0031655 0.0015981 0.0035039
Flavobacterium 0.0024805 0.0018970 0.0044455 0.0017056
Litoricola 0.0044207 0.0028318 0.0015289 0.0000004
Vibrio 0.0024719 0.0025171 0.0016892 0.0016926
Balneola 0.0029779 0.0017469 0.0022828 0.0005516
Erythrobacter 0.0013159 0.0032885 0.0015133 0.0006846
Pseudoalteromonas 0.0010396 0.0016676 0.0006369 0.0036343
Sulfitobacter 0.0018072 0.0019539 0.0018984 0.0008121
Nisaea 0.0023232 0.0005941 0.0030482 0.0003829
Candidatus Endolissoclinum 0.0014874 0.0015088 0.0016374 0.0012991
Other 0.1035927 0.1069799 0.1120250 0.0860830
# Keep only the taxa whose Phylum is Bacteroidetes
megan_Bacteroidetes <- subset_taxa(megan_genus_abundant, Phylum == "Bacteroidetes")

# Shannon diversity, computed on the transposed count table
# (presumably so that samples are in rows - confirm against the otu_table orientation)
megan_genus_shannon_Bacteroidetes <- diversity(
  t(otu_table(megan_Bacteroidetes)),
  index = "shannon"
)

# Table for the boxplots: one row per sample with its Shannon index plus the
# sample metadata. as.data.frame() is called directly on the vector so that
# the value column is named after it (megan_genus_shannon_Bacteroidetes).
megan_genus_shannon_boxplots_Bacteroidetes <-
  as.data.frame(megan_genus_shannon_Bacteroidetes) %>%
  tibble::rownames_to_column("Sample_ID") %>%
  left_join(rownames_to_column(metadata, "Sample_ID"))

# Median and standard deviation of the Shannon index within each sampling trip
# (approach from https://stackoverflow.com/questions/13372734/how-to-display-the-median-value-in-a-faceted-boxplot-in-ggplot)
Shannon_numerical_summ_Bacteroidetes <- ddply(
  .data = megan_genus_shannon_boxplots_Bacteroidetes,
  .variables = .(Sampling_trip),
  .fun = summarize,
  med = median(megan_genus_shannon_Bacteroidetes),
  SD = sd(megan_genus_shannon_Bacteroidetes)
)

# Horizontal boxplots of the Shannon index within Bacteroidetes, one box per
# sampling trip, with the raw points, the per-trip median/SD as text and the
# pairwise Wilcoxon test results overlaid.
ggplot(
  megan_genus_shannon_boxplots_Bacteroidetes,
  aes(y = megan_genus_shannon_Bacteroidetes, x = Sampling_trip) # Shannon index
) +
  geom_boxplot(aes(fill = factor(Sampling_trip)), outlier.shape = NA) +
  geom_jitter(size = 1.2, alpha = 0.5) + # adding actual data points
  # Median of each trip, printed at the median position
  geom_text(
    data = Shannon_numerical_summ_Bacteroidetes,
    aes(y = med, label = round(med, 2)),
    size = 4.5, vjust = -0.5
  ) +
  # Standard deviation of each trip, printed at y = SD.
  # NOTE(review): the SD values in the summary table (about 0.25-0.62) lie
  # outside ylim(1, 5), so these labels are likely dropped from the plot -
  # confirm (the figure is adjusted manually afterwards).
  geom_text(
    data = Shannon_numerical_summ_Bacteroidetes,
    aes(y = SD, label = round(SD, 2)),
    size = 4.5, vjust = -0.5
  ) +
  scale_fill_manual(values = c(
    "indianred", # Sampling trip 1
    "indianred4", # Sampling trip 2
    "red3", # Sampling trip 3
    "slateblue" # Sampling trip 4
  )) +
  coord_flip() + # just flipping the plot
  labs(
    title = "Alpha diversity - within Bacteroidetes",
    subtitle = "Across sampling trips",
    x = "Sampling events",
    y = "Shannon index"
  ) +
  # theme_bw() is a complete theme, so it must come BEFORE the theme() tweak:
  # added afterwards (as before) it replaced the axis.text.x setting entirely.
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12)) +
  ylim(1, 5) +
  stat_pvalue_manual(
    megan_genus_shannon_boxplots_Bacteroidetes %>%
      pairwise_wilcox_test(megan_genus_shannon_Bacteroidetes ~ Sampling_trip) %>%
      add_xy_position()
  )
Bacteroidetes Shannon index

Bacteroidetes Shannon index

# I will modify this manually in Inkscape
Median and standard deviation for Shannon Index values within Bacteroidetes, computed within trips.
Sampling_trip med SD
Trip_01_Nov-Dec_2019 2.696195 0.4270093
Trip_02_January_2020 2.642752 0.3609917
Trip_03_February_2020 2.496523 0.6229173
Trip_04_July_2020 2.237772 0.2510951
# Numerical output: pairwise Wilcoxon tests between sampling trips (Bacteroidetes Shannon index)
knitr::kable(
  pairwise_wilcox_test(
    megan_genus_shannon_boxplots_Bacteroidetes,
    megan_genus_shannon_Bacteroidetes ~ Sampling_trip
  ),
  caption = "Wilcoxon rank sum test to compare median Shanon Diversity between trips, computed within Bacteroidetes."
)
Wilcoxon rank sum test to compare median Shanon Diversity between trips, computed within Bacteroidetes.
.y. group1 group2 n1 n2 statistic p p.adj p.adj.signif
megan_genus_shannon_Bacteroidetes Trip_01_Nov-Dec_2019 Trip_02_January_2020 44 48 1227 1.84e-01 3.68e-01 ns
megan_genus_shannon_Bacteroidetes Trip_01_Nov-Dec_2019 Trip_03_February_2020 44 43 1145 9.20e-02 2.76e-01 ns
megan_genus_shannon_Bacteroidetes Trip_01_Nov-Dec_2019 Trip_04_July_2020 44 56 2049 0.00e+00 1.00e-07 ****
megan_genus_shannon_Bacteroidetes Trip_02_January_2020 Trip_03_February_2020 48 43 1146 3.69e-01 3.69e-01 ns
megan_genus_shannon_Bacteroidetes Trip_02_January_2020 Trip_04_July_2020 48 56 2161 1.00e-07 5.00e-07 ****
megan_genus_shannon_Bacteroidetes Trip_03_February_2020 Trip_04_July_2020 43 56 1519 2.60e-02 1.06e-01 ns

Does diversity differ across trips when computed on overall communities?

# Shannon diversity of every sample, computed on the transposed OTU table of megan_genus_abundant
megan_genus_shannon <- diversity(t(otu_table(megan_genus_abundant)), index = "shannon")

# Table for the boxplots: one row per sample, Shannon index joined to the sample metadata.
# as.data.frame() is called directly on megan_genus_shannon because the resulting column
# takes its name from that symbol, and the plot below refers to it by that name.
megan_genus_shannon_boxplots <- left_join(
  tibble::rownames_to_column(as.data.frame(megan_genus_shannon), "Sample_ID"),
  rownames_to_column(metadata, "Sample_ID")
)

# Per-trip median and SD; both are printed as text on the plot below.
# Recipe from: https://stackoverflow.com/questions/13372734/how-to-display-the-median-value-in-a-faceted-boxplot-in-ggplot
Shannon_numerical_summ <- ddply(
  .data = megan_genus_shannon_boxplots,
  .variables = .(Sampling_trip),
  .fun = summarize,
  med = median(megan_genus_shannon),
  SD = sd(megan_genus_shannon)
)

# Boxplot of the Shannon index per sampling trip, with pairwise Wilcoxon brackets
ggplot(
  megan_genus_shannon_boxplots,
  aes(x = Sampling_trip, y = megan_genus_shannon)
) +
  geom_boxplot(aes(fill = factor(Sampling_trip)), outlier.shape = NA) + # outliers hidden here, shown by the jitter layer
  geom_jitter(size = 1.2, alpha = 0.5) + # the individual samples
  # median of each trip, printed at the position of the median
  geom_text(
    data = Shannon_numerical_summ,
    aes(y = med, label = round(med, 2)),
    size = 4.5,
    vjust = -0.5
  ) +
  # SD of each trip, printed at y = SD (not at the spread it describes)
  geom_text(
    data = Shannon_numerical_summ,
    aes(y = SD, label = round(SD, 2)),
    size = 4.5,
    vjust = -0.5
  ) +
  scale_fill_manual(
    values = c(
      "indianred", # Sampling trip 1
      "indianred4", # Sampling trip 2
      "red3", # Sampling trip 3
      "slateblue" # Sampling trip 4
    )
  ) +
  coord_flip() + # trips on the vertical axis
  labs(
    title = "Alpha diversity - overall microbial communities",
    subtitle = "Across sampling trips",
    x = "Sampling events",
    y = "Shannon index"
  ) +
  # NOTE(review): theme_bw() is a complete theme, so adding it after theme() resets the
  # axis.text.x settings on the next line - confirm whether the rotation was meant to apply
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12)) +
  theme_bw() +
  stat_pvalue_manual(
    add_xy_position(
      pairwise_wilcox_test(
        megan_genus_shannon_boxplots,
        megan_genus_shannon ~ Sampling_trip
      )
    )
  )
Overall community Shannon index

Overall community Shannon index

# The figure above gets its final touches by hand in Inkscape; the table below lists the values it shows
knitr::kable(
  x = Shannon_numerical_summ,
  caption = "Median and standard deviation for Shannon Index values, computed within trips."
)
Median and standard deviation for Shannon Index values, computed within trips.
Sampling_trip med SD
Trip_01_Nov-Dec_2019 1.800334 0.4372334
Trip_02_January_2020 1.576837 0.5217831
Trip_03_February_2020 1.803015 0.4967308
Trip_04_July_2020 1.735373 0.2994645
# Numerical output: pairwise Wilcoxon tests between sampling trips (overall Shannon index)
knitr::kable(
  pairwise_wilcox_test(
    megan_genus_shannon_boxplots,
    megan_genus_shannon ~ Sampling_trip
  ),
  caption = "Wilcoxon rank sum test to compare median Shanon Diversity between trips, computed for overall communities."
)
Wilcoxon rank sum test to compare median Shanon Diversity between trips, computed for overall communities.
.y. group1 group2 n1 n2 statistic p p.adj p.adj.signif
megan_genus_shannon Trip_01_Nov-Dec_2019 Trip_02_January_2020 44 48 1256 0.119 0.714 ns
megan_genus_shannon Trip_01_Nov-Dec_2019 Trip_03_February_2020 44 43 954 0.949 1.000 ns
megan_genus_shannon Trip_01_Nov-Dec_2019 Trip_04_July_2020 44 56 1379 0.309 1.000 ns
megan_genus_shannon Trip_02_January_2020 Trip_03_February_2020 48 43 911 0.340 1.000 ns
megan_genus_shannon Trip_02_January_2020 Trip_04_July_2020 48 56 1197 0.339 1.000 ns
megan_genus_shannon Trip_03_February_2020 Trip_04_July_2020 43 56 1378 0.221 1.000 ns
# Abundance table of the GO terms, with the functional ranks merged into the row names
go.anosim <- left_join(
  rownames_to_column(as.data.frame(otu_table(megan_GO_5_RA_no_rare)), "OTU"),
  rownames_to_column(megan_GO_5_FUN, "OTU")
)
# One label per feature: ID followed by Rank1-Rank6 (Rank7 and Rank8 are deliberately left out)
go.anosim <- unite(
  go.anosim,
  taxonomy,
  c(OTU, Rank1, Rank2, Rank3, Rank4, Rank5, Rank6),
  sep = "; "
)
go.anosim <- column_to_rownames(go.anosim, "taxonomy")
# ANOSIM does not accept missing values, so features (rows) containing NAs are dropped
go.anosim <- na.omit(go.anosim)

# ANOSIM between sampling trips, on Bray-Curtis dissimilarities of the transposed table
ano_go <- anosim(
  x = t(go.anosim),
  grouping = sample_data(megan_GO_5_RA_no_rare)$Sampling_trip,
  permutations = 9999,
  distance = "bray"
)
# Results
ano_go
## 
## Call:
## anosim(x = t(go.anosim), grouping = sample_data(megan_GO_5_RA_no_rare)$Sampling_trip,      permutations = 9999, distance = "bray") 
## Dissimilarity: bray 
## 
## ANOSIM statistic R: 0.3743 
##       Significance: 1e-04 
## 
## Permutation: free
## Number of permutations: 9999

Pairwise PERMANOVA - GO terms (rank 5)

Integrating microbial and environmental data

Partial Mantel tests

# Compute the mantel tests - cite the source of where this is coming from!
#' Partial Mantel test of one distance matrix against each environmental variable
#'
#' For every column of `env.df`, runs `mantel.partial()` (presumably
#' `vegan::mantel.partial` - confirm) between `distance` and the distance
#' matrix of that column (built with `dist()`), controlling for `geo.dist`.
#' Samples with a missing value in the column are removed from all three
#' matrices for that test only.
#'
#' @param distance Community distances between samples (`dist` object or
#'   anything `as.matrix()` turns into a square matrix).
#' @param env.df Data frame with one row per sample, in the same order as
#'   `distance`, and one column per environmental variable.
#' @param geo.dist Distances used as the control matrix, same samples and
#'   order as `distance`.
#' @param method Correlation method passed to `mantel.partial()`.
#' @param permutations Number of permutations passed to `mantel.partial()`.
#' @param p.adjust.method Correction passed to `p.adjust()`, applied across
#'   all columns of `env.df`.
#' @return A data frame with one row per column of `env.df`: `var` (column
#'   name), `statistic`, `pval` (permutation p-value), `p.corr` (adjusted
#'   p-value) and `n.obs` (samples without a missing value for that variable).
multimantel <- function(distance, env.df, geo.dist,
                        method = "pearson",
                        permutations = 1000,
                        p.adjust.method = "bonferroni") {
  n.var <- ncol(env.df)
  # Results are preallocated instead of being grown with c() on every pass
  statistic <- numeric(n.var)
  pval <- numeric(n.var)
  n.obs <- integer(n.var)

  # Square copies are only used to drop the rows/columns of incomplete samples
  comm.mat <- as.matrix(distance)
  geo.mat <- as.matrix(geo.dist)

  for (i in seq_len(n.var)) {
    keep <- !is.na(env.df[, i])
    if (all(keep)) {
      # Complete variable: the inputs are passed on untouched
      tmp <- mantel.partial(
        distance,
        dist(env.df[, i]),
        geo.dist,
        method = method,
        permutations = permutations
      )
    } else {
      # Restrict all three matrices to the samples measured for this variable
      tmp <- mantel.partial(
        as.dist(comm.mat[keep, keep]),
        dist(env.df[keep, i]),
        as.dist(geo.mat[keep, keep]),
        method = method,
        permutations = permutations
      )
    }
    statistic[i] <- tmp$statistic
    pval[i] <- tmp$signif
    n.obs[i] <- sum(keep)
  }

  data.frame(
    var = colnames(env.df),
    statistic,
    pval,
    p.corr = p.adjust(pval, method = p.adjust.method),
    n.obs
  )
}

### Bray-Curtis dissimilarities, computed on the relative abundance data from which rare features were excluded
# Taxonomy
megan_genus_dist <- vegdist(x = t(otu_table(megan_genus_RA_no_rare)), method = "bray")
# GO terms
megan_go_dist <- vegdist(x = t(otu_table(megan_GO_5_RA_no_rare)), method = "bray")

# Geographic distances (in km) between the IMOS-MGD samples - needed because the Mantel tests are corrected for geography

# Sample metadata as a plain data frame, with the sample IDs moved from the row names into a column
metadata_Mantel <- as.data.frame(as.matrix(sample_data(megan_genus_clr)))
metadata_Mantel <- rownames_to_column(metadata_Mantel, "Sample_ID")

# Site coordinates, attached to every sample by reef name and indexed by sample ID
map_coords_Mantel <- metadata_Mantel[, c(1, 2)] %>% # Sample_ID plus the second column, which is joined on as REEF_NAME
  left_join(
    read.csv("/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Code/input_files/MARKO_for_eReefs_Lats_Longs.csv"),
    by = c("REEF_NAME" = "name")
  ) %>%
  dplyr::select(-REEF_NAME) %>%
  dplyr::rename(name = Sample_ID) %>%
  remove_rownames() %>%
  column_to_rownames(var = "name")

# pointDistance() expects longitude first, then latitude
map_coords_reorder <- relocate(map_coords_Mantel, lon, lat)

# Pairwise distances, divided by 1000 and rounded to whole km
# TODO: probably better to compute this separately for each of the 4 trips, once this code is confirmed to work
IMOS_mar.dist.mat <- round(pointDistance(map_coords_reorder, lonlat = TRUE) / 1000)
dimnames(IMOS_mar.dist.mat) <- list(metadata_Mantel$Sample_ID, metadata_Mantel$Sample_ID)

# Constant helper column prepended to the metadata: every sample gets a = 0, so the
# `a == "0"` filters below are TRUE for all rows and no sample is dropped
# (presumably a leftover of a subset filter in the code this was adapted from).
# NOTE(review): re-running this line prepends another `a` column and shifts the 26:42 window below.
metadata_Mantel <- cbind(a = 0, metadata_Mantel)

# partial Mantels - microbial taxa
partial_Mantel_taxa_res <- multimantel(
  distance = as.dist(as.matrix(megan_genus_dist)[metadata_Mantel$a == "0", metadata_Mantel$a == "0"]),
  env.df = metadata_Mantel[metadata_Mantel$a == "0", colnames(metadata_Mantel)[26:42]], # columns 26-42: the 17 numerical WQ variables
  geo.dist = as.dist(as.matrix(IMOS_mar.dist.mat)[metadata_Mantel$a == "0", metadata_Mantel$a == "0"]) # geographic distances, in km
)
# Table sorted by increasing strength of the association
knitr::kable(
  arrange(partial_Mantel_taxa_res, abs(statistic)),
  caption = "Partial Mantel tests assessing which physico-chemical parameters mat act as significant drivers of seawater microbiomes at the taxonomic level."
)
Partial Mantel tests assessing which physico-chemical parameters mat act as significant drivers of seawater microbiomes at the taxonomic level.
var statistic pval p.corr n.obs
FLUORESCENCE_2.5m_RV 0.0106918 0.3536464 1.0000000 191
median_TDN_µM -0.0160852 0.6793207 1.0000000 191
median_NO3_µM 0.0175296 0.2907093 1.0000000 191
median_DOC_µM -0.0227709 0.8021978 1.0000000 191
median_Si_µM 0.0237783 0.1858142 1.0000000 191
SALINITY_2.5m_RV -0.0350001 0.8911089 1.0000000 191
median_TSS_mg_L -0.0429887 0.8241758 1.0000000 187
median_Chlorophyll_A_µg_L 0.0573543 0.0989011 1.0000000 191
median_Phaeophytin_A_µg_L 0.0630440 0.0799201 1.0000000 191
median_NO2_µM 0.1063563 0.0009990 0.0169830 191
median_NH4_µM 0.1074423 0.0079920 0.1358641 191
median_PP_µM 0.1322265 0.0019980 0.0339660 187
median_TDP_µM 0.2247735 0.0009990 0.0169830 191
median_POC_µM 0.2335920 0.0009990 0.0169830 191
median_PN_µM 0.2648581 0.0009990 0.0169830 191
SEAWATER_TEMPERATURE_2.5m_RV 0.2957969 0.0009990 0.0169830 191
median_PO4_µM 0.3630517 0.0009990 0.0169830 191
# partial Mantels - microbial function (GO terms)
# `a` is the constant helper column of metadata_Mantel, so `a == "0"` keeps every sample
partial_Mantel_GOs_res <- multimantel(
  distance = as.dist(as.matrix(megan_go_dist)[metadata_Mantel$a == "0", metadata_Mantel$a == "0"]),
  env.df = metadata_Mantel[metadata_Mantel$a == "0", colnames(metadata_Mantel)[26:42]], # columns 26-42: the 17 numerical WQ variables
  geo.dist = as.dist(as.matrix(IMOS_mar.dist.mat)[metadata_Mantel$a == "0", metadata_Mantel$a == "0"]) # geographic distances, in km
)
# Table sorted by increasing strength of the association
knitr::kable(
  arrange(partial_Mantel_GOs_res, abs(statistic)),
  caption = "Partial Mantel tests assessing which physico-chemical parameters mat act as significant drivers of seawater microbiomes at the functional level."
)
Partial Mantel tests assessing which physico-chemical parameters mat act as significant drivers of seawater microbiomes at the functional level.
var statistic pval p.corr n.obs
median_Si_µM 0.0111393 0.3076923 1.0000000 191
SALINITY_2.5m_RV -0.0130414 0.6993007 1.0000000 191
median_TSS_mg_L -0.0242762 0.7142857 1.0000000 187
median_TDN_µM 0.0293237 0.1438561 1.0000000 191
median_NO2_µM 0.0304584 0.1388611 1.0000000 191
median_NO3_µM -0.0370669 0.8851149 1.0000000 191
median_NH4_µM 0.0511959 0.0459540 0.7812188 191
FLUORESCENCE_2.5m_RV 0.0532699 0.0279720 0.4755245 191
median_Phaeophytin_A_µg_L 0.0930029 0.0069930 0.1188811 191
median_Chlorophyll_A_µg_L 0.1126347 0.0019980 0.0339660 191
median_DOC_µM 0.1157628 0.0009990 0.0169830 191
median_PP_µM 0.1189701 0.0009990 0.0169830 187
median_PN_µM 0.2554285 0.0009990 0.0169830 191
median_TDP_µM 0.2583475 0.0009990 0.0169830 191
median_PO4_µM 0.2640840 0.0009990 0.0169830 191
median_POC_µM 0.2755923 0.0009990 0.0169830 191
SEAWATER_TEMPERATURE_2.5m_RV 0.2990635 0.0009990 0.0169830 191
# WQ - taxonomy: one-column tables (rows = WQ variables) holding the partial Mantel
# statistics and their p-values, in the shape the heatmap function is given below
partial_Mantel_cor.mat_taxa_WQ <- data.frame(
  row.names = partial_Mantel_taxa_res$var,
  Taxonomy = partial_Mantel_taxa_res$statistic
)

# NOTE(review): these are the raw permutation p-values (`pval`), not the
# Bonferroni-adjusted `p.corr` - confirm which one the heatmap should be thresholded on
partial_Mantel_pcor.mat_taxa_WQ <- data.frame(
  row.names = partial_Mantel_taxa_res$var,
  Taxonomy = partial_Mantel_taxa_res$pval
)
# Ordering - highest correlations first (not applied)
# WQ_ordre<-order(apply(IMOS_cor.mat_WQ[,1:2],1,mean),decreasing = T)

# WQ - functions (GO terms): same two tables
partial_Mantel_cor.mat_GOs_WQ <- data.frame(
  row.names = partial_Mantel_GOs_res$var,
  Functions = partial_Mantel_GOs_res$statistic
)

# NOTE(review): raw `pval` again, not `p.corr` - confirm
partial_Mantel_pcor.mat_GOs_WQ <- data.frame(
  row.names = partial_Mantel_GOs_res$var,
  Functions = partial_Mantel_GOs_res$pval
)
# Ordering - highest correlations first (not applied)
# WQ_ordre<-order(apply(IMOS_cor.mat_WQ[,1:2],1,mean),decreasing = T)

# Heatmaps of the partial Mantel statistics; cells whose p-value in `p.mat`
# does not pass `sig.level` are left blank
# Taxonomy
heatmap_partial_Mantels_taxa_WQ <- ggcorrplot(
  corr = partial_Mantel_cor.mat_taxa_WQ, # [ordre,] would put the strongest drivers first
  p.mat = partial_Mantel_pcor.mat_taxa_WQ, # [ordre,] would put the strongest drivers first
  sig.level = 0.05,
  insig = "blank",
  method = "square",
  colors = c("#2874b2", "white", "#ba2832"),
  lab = TRUE,
  lab_size = 2.5
)
heatmap_partial_Mantels_taxa_WQ

# Functions
heatmap_partial_Mantels_GOs_WQ <- ggcorrplot(
  corr = partial_Mantel_cor.mat_GOs_WQ, # [ordre,] would put the strongest drivers first
  p.mat = partial_Mantel_pcor.mat_GOs_WQ, # [ordre,] would put the strongest drivers first
  sig.level = 0.05,
  insig = "blank",
  method = "square",
  colors = c("#2874b2", "white", "#ba2832"),
  lab = TRUE,
  lab_size = 2.5
)
heatmap_partial_Mantels_GOs_WQ

# Merging the two
# patchwork::wrap_plots(heatmap_partial_Mantels_taxa_WQ,
#                      heatmap_partial_Mantels_GOs_WQ,
#                      nrow = 2,
#                      ncol = 1)

MINT sPLS

To (1) identify stable microbial indicators—both taxonomic and functional—that consistently respond to specific physico-chemical variables (e.g., nutrient loads, temperature, salinity) across broad spatio-temporal scales in the GBR, we extended a Sparse Partial Least Squares analysis (sPLS, see Lê Cao et al. 2008, 2009), widely used in microbial oceanography to correlate microbial data with continuous environmental metrics (see e.g. Guidi et al. 2016; Jameson et al. 2023; Priest et al. 2023), with a Multivariate INTegrative method (MINT, see Rohart et al., 2017a) to integrate data from four independent sampling trips. MINT (Multivariate INTegration, Rohart et al. (2017b)) is a method based on multi-group PLS that includes information about samples belonging to independent groups or studies (Eslami et al., 2014). In this context, the challenge was to accommodate confounding effects between season and geography, as each site was sampled only once in time and space. By using MINT sPLS, we aimed to identify microbial indicator taxa and genes that correlate with water chemistry metrics and are shared across the four sampling transects, regardless of geography or season. Similar to sPLS, in MINT sPLS we retained two dimensions and 50 features (microbial taxa or genes) per dimension for the X datasets, and all WQ metrics for the Y dataset.

But the MINT sPLS sample plot and circle correlation plots can be combined into a biplot, which will present both types of information.

Below is the code from Kim-Anh to create a MINT sPLS biplot:

# create MINT sPLS object (worked example on the `stemcells` data)

data(stemcells)

# for the purpose of this example, we artificially
# create a continuous response Y by taking genes 1:10.

X <- stemcells$gene[, -c(1:10)]
# renaming columns here so that I can identify the X and Y datasets
colnames(X) <- paste("X", seq_len(ncol(X)), sep = ".")

Y <- stemcells$gene[, 1:10]
# renaming columns here so that I can identify the X and Y datasets
colnames(Y) <- paste("Y", seq_len(ncol(Y)), sep = ".")


# here selecting only on X
res <- mint.spls(
  X = X, Y = Y, ncomp = 2,
  keepX = c(10, 5), study = stemcells$study
)

plotIndiv(res) # symbol represent study
plotVar(res)


library(ggrepel)

# ---- MINT sPLS biplot ----
# Samples are drawn at their X-variates. Every selected X and Y variable is
# drawn as an arrow ending at its correlation with the two components,
# rescaled to the range of the sample scores.

# INPUT ARGUMENTS
col <- res$study # color of samples according to the study
pch <- res$study # pch of samples according to the study
var.names.col <- "grey40"
var.names.size <- 4
var.arrow.col.X <- "lightblue" # color of arrow + name for X data set
var.arrow.col.Y <- "orange" # color of arrow + name for Y data set
var.arrow.size <- 0.5
var.arrow.length <- 0.2
# components to be plotted
comp1 <- 1
comp2 <- 2
# input the MINT res object
object <- res
comp <- object$ncomp


## --- code starts here --------
# identify variables selected: a variable counts as selected when any of its
# loadings is non-zero. Each loading is tested on its own because a plain row
# sum would also be zero for loadings that cancel out across components.
selection.X <- rowSums(object$loadings$X[, seq_len(comp), drop = FALSE] != 0) > 0
selection.Y <- rowSums(object$loadings$Y[, seq_len(comp), drop = FALSE] != 0) > 0
loadings.X <- data.frame(object$loadings$X[selection.X, , drop = FALSE])
loadings.Y <- data.frame(object$loadings$Y[selection.Y, , drop = FALSE])

# correlation of every selected variable with the components of its own data set
# cutoff for the correlation circle plot (0 = keep every selected variable)
cutoff <- 0
cors.X <- cor(object$X[, selection.X, drop = FALSE], object$variates$X[, seq_len(comp)], use = "pairwise.complete.obs")
cors.Y <- cor(object$Y[, selection.Y, drop = FALSE], object$variates$Y[, seq_len(comp)], use = "pairwise.complete.obs")
above.cutoff.X <- apply(cors.X, 1, function(x) any(abs(x) >= cutoff))
above.cutoff.Y <- apply(cors.Y, 1, function(x) any(abs(x) >= cutoff))
loadings.X <- loadings.X[above.cutoff.X, ]
loadings.Y <- loadings.Y[above.cutoff.Y, ]

# only representing the samples in the X space
variates <- data.frame(object$variates$X)
## scaler of var vs sample coordinates
scaler <- max(variates, na.rm = TRUE) / max(abs(cors.X), na.rm = TRUE)
## potentially need to extend this for the Y scaler??

axes.titles <- c("Comp 1", "Comp 2")

# the correlations are rescaled - need to fiddle a bit here
cors.X <- cors.X * scaler * 0.8
cors.Y <- cors.Y * scaler * 0.8

var.labels.X <- rownames(loadings.X)
var.labels.Y <- rownames(loadings.Y)

# Plot data. Every layer gets its own data frame, so the plot keeps a copy of
# its coordinates. Bare vectors inside aes() are only looked up when the plot
# is printed, and names such as `variates` and `cors.X` are reassigned by the
# biplots further down in this document.
samples.df <- data.frame(
  comp.x = variates[, comp1],
  comp.y = variates[, comp2],
  colour.by = col,
  shape.by = pch
)
# The cutoff is applied to arrows and labels alike, so they always line up
arrows.X <- data.frame(
  xend = cors.X[above.cutoff.X, comp1],
  yend = cors.X[above.cutoff.X, comp2],
  label = var.labels.X,
  row.names = NULL
)
arrows.Y <- data.frame(
  xend = cors.Y[above.cutoff.Y, comp1],
  yend = cors.Y[above.cutoff.Y, comp2],
  label = var.labels.Y,
  row.names = NULL
)

## ------------- outline of plot -----
gg_biplot <- ggplot() +
  theme_classic() +
  labs(x = axes.titles[1], y = axes.titles[2]) +
  # vline and hline - comment these two layers out if you don't want the lines
  geom_vline(xintercept = 0, linewidth = 0.3, col = "grey75") +
  geom_hline(yintercept = 0, linewidth = 0.3, col = "grey75")
gg_biplot
# ------


# PLOT SAMPLES
gg_biplot <- gg_biplot +
  geom_point(
    data = samples.df,
    aes(x = comp.x, y = comp.y, col = colour.by, shape = shape.by),
    size = 2,
    show.legend = FALSE
  )
gg_biplot


# PLOT VARIABLES
## lines and arrows
# X variables
gg_biplot <- gg_biplot +
  geom_segment(
    data = arrows.X,
    aes(x = 0, y = 0, xend = xend, yend = yend),
    col = var.arrow.col.X,
    arrow = arrow(length = unit(var.arrow.length, "cm")),
    linewidth = var.arrow.size,
    show.legend = FALSE
  )
gg_biplot

# Y variables
gg_biplot <- gg_biplot +
  geom_segment(
    data = arrows.Y,
    aes(x = 0, y = 0, xend = xend, yend = yend),
    col = var.arrow.col.Y,
    arrow = arrow(length = unit(var.arrow.length, "cm")),
    linewidth = var.arrow.size,
    show.legend = FALSE
  )
gg_biplot

## labels of X variables
gg_biplot <- gg_biplot +
  geom_text_repel(
    data = arrows.X,
    aes(x = xend, y = yend, label = label),
    col = var.arrow.col.X
  )

## labels of Y variables
gg_biplot <- gg_biplot +
  geom_text_repel(
    data = arrows.Y,
    aes(x = xend, y = yend, label = label),
    col = var.arrow.col.Y
  )

gg_biplot

Which I applied on our data, first on taxa:

# Y dataset: columns 24-40 of the sample data, renamed below so that the WQ variables are easy to tell apart on the plots
metadata_MINT_biplot <- sample_data(megan_genus_clr)[, 24:40]

# Let's make the names nicer for plotting:
old_names <- c("median_Chlorophyll_A_µg_L", "median_Phaeophytin_A_µg_L",
               "median_PN_µM", "median_POC_µM", "median_PP_µM",
               "median_DOC_µM", "median_PO4_µM", "median_NH4_µM",
               "median_NO2_µM", "median_NO3_µM", "median_Si_µM",
               "median_TDN_µM", "median_TDP_µM", "median_TSS_mg_L",
               "SEAWATER_TEMPERATURE_2.5m_RV", "SALINITY_2.5m_RV",
               "FLUORESCENCE_2.5m_RV")

new_names <- c("Ch-a", "Phaeo", "PN", "POC", "PP",
               "DOC", "PO4", "NH4", "NO2", "NO3", "Si", "TDN",
               "TDP", "TSS", "SST_2.5m_RV", "SALINITY_2.5m_RV",
               "FLUORESCENCE_2.5m_RV")

# The columns are picked by position, so stop with a readable message if one
# of the expected variables is not among them
missing_names <- setdiff(old_names, colnames(metadata_MINT_biplot))
if (length(missing_names) > 0) {
  stop(
    "Columns 24:40 of the sample data do not contain: ",
    paste(missing_names, collapse = ", "),
    call. = FALSE
  )
}

# Find indices of old names in current column names
indices <- match(old_names, colnames(metadata_MINT_biplot))

# Replace old names with new names
colnames(metadata_MINT_biplot)[indices] <- new_names

# MINT sPLS with the sampling trip as the study; variable selection on X only (10 per component)
res <- mint.spls(
  X = OTUs_biplot_names,
  Y = metadata_MINT_biplot,
  ncomp = 2,
  keepX = c(10, 10),
  study = sample_data(megan_genus_clr)$Sampling_trip
)

plotIndiv(res)
# Sample plot coloured by sampling trip (symbol represents study)
plotIndiv(
  res,
  group = res$study,
  # title = 'global MINT sPLS | Microbial Taxonomy-WQ',
  # legend.title = 'Sampling Trip',
  legend = TRUE,
  rep.space = "XY-variate",
  col.per.group = c(
    "indianred", # Sampling trip 1
    "indianred4", # Sampling trip 2
    "red3", # Sampling trip 3
    "slateblue" # Sampling trip 4
  )
)
plotVar(res)
library(ggrepel)

# ---- MINT sPLS biplot: microbial taxa (X) vs WQ (Y) ----
# Samples are drawn at their X-variates. Every selected X and Y variable is
# drawn as an arrow ending at its correlation with the two components,
# rescaled to the range of the sample scores.

# INPUT ARGUMENTS
col <- res$study # color of samples according to the study
pch <- res$study # pch of samples according to the study
var.names.col <- "grey40"
var.names.size <- 4
var.arrow.col.X <- "lightblue" # color of arrow + name for X data set
var.arrow.col.Y <- "orange" # color of arrow + name for Y data set
var.arrow.size <- 0.5
var.arrow.length <- 0.2
# components to be plotted
comp1 <- 1
comp2 <- 2
# input the MINT res object
object <- res
comp <- object$ncomp

## --- code starts here --------
# identify variables selected: a variable counts as selected when any of its
# loadings is non-zero. Each loading is tested on its own because a plain row
# sum would also be zero for loadings that cancel out across components.
selection.X <- rowSums(object$loadings$X[, seq_len(comp), drop = FALSE] != 0) > 0
selection.Y <- rowSums(object$loadings$Y[, seq_len(comp), drop = FALSE] != 0) > 0
loadings.X <- data.frame(object$loadings$X[selection.X, , drop = FALSE])
loadings.Y <- data.frame(object$loadings$Y[selection.Y, , drop = FALSE])

# correlation of every selected variable with the components of its own data set
# cutoff for the correlation circle plot (0 = keep every selected variable)
cutoff <- 0
cors.X <- cor(object$X[, selection.X, drop = FALSE], object$variates$X[, seq_len(comp)], use = "pairwise.complete.obs")
cors.Y <- cor(object$Y[, selection.Y, drop = FALSE], object$variates$Y[, seq_len(comp)], use = "pairwise.complete.obs")
above.cutoff.X <- apply(cors.X, 1, function(x) any(abs(x) >= cutoff))
above.cutoff.Y <- apply(cors.Y, 1, function(x) any(abs(x) >= cutoff))
loadings.X <- loadings.X[above.cutoff.X, ]
loadings.Y <- loadings.Y[above.cutoff.Y, ]

# only representing the samples in the X space
variates <- data.frame(object$variates$X)

## scaler of var vs sample coordinates - one for each data set
scaler.X <- max(object$variates$X, na.rm = TRUE) / max(abs(cors.X), na.rm = TRUE)
scaler.Y <- max(object$variates$Y, na.rm = TRUE) / max(abs(cors.Y), na.rm = TRUE)

axes.titles <- c("Comp 1", "Comp 2")

# the correlations are rescaled - need to fiddle a bit here
cors.X <- cors.X * scaler.X * 0.7
cors.Y <- cors.Y * scaler.Y * 0.8

var.labels.X <- rownames(loadings.X)
var.labels.Y <- rownames(loadings.Y)

# Plot data. Every layer gets its own data frame, so the stored plot keeps a
# copy of its coordinates. Bare vectors inside aes() are only looked up when
# the plot is printed, and `res`, `variates`, `cors.X`, ... are reassigned by
# the biplot of the functions further down - printing this plot again after
# that would silently draw the functional data.
samples.df <- data.frame(
  comp.x = variates[, comp1],
  comp.y = variates[, comp2],
  colour.by = col,
  shape.by = pch
)
# The cutoff is applied to arrows and labels alike, so they always line up
arrows.X <- data.frame(
  xend = cors.X[above.cutoff.X, comp1],
  yend = cors.X[above.cutoff.X, comp2],
  label = var.labels.X,
  row.names = NULL
)
arrows.Y <- data.frame(
  xend = cors.Y[above.cutoff.Y, comp1],
  yend = cors.Y[above.cutoff.Y, comp2],
  label = var.labels.Y,
  row.names = NULL
)

## ------------- outline of plot -----
gg_biplot <- ggplot() +
  theme_classic() +
  labs(x = axes.titles[1], y = axes.titles[2]) +
  # vline and hline - comment these two layers out if you don't want the lines
  # (`linewidth` replaces `size`, which is deprecated for lines since ggplot2 3.4.0)
  geom_vline(xintercept = 0, linewidth = 0.3, col = "grey75") +
  geom_hline(yintercept = 0, linewidth = 0.3, col = "grey75")
gg_biplot
# ------


# PLOT SAMPLES
gg_biplot <- gg_biplot +
  geom_point(
    data = samples.df,
    aes(x = comp.x, y = comp.y, col = colour.by, shape = shape.by),
    size = 2,
    show.legend = FALSE
  ) +
  scale_color_manual(
    values = c(
      "indianred", # Sampling trip 1
      "indianred4", # Sampling trip 2
      "red3", # Sampling trip 3
      "slateblue" # Sampling trip 4
    )
  )
gg_biplot

# PLOT VARIABLES
## lines and arrows
# X variables
gg_biplot <- gg_biplot +
  geom_segment(
    data = arrows.X,
    aes(x = 0, y = 0, xend = xend, yend = yend),
    col = var.arrow.col.X,
    arrow = arrow(length = unit(var.arrow.length, "cm")),
    linewidth = var.arrow.size,
    show.legend = FALSE
  )
gg_biplot

# Y variables
gg_biplot <- gg_biplot +
  geom_segment(
    data = arrows.Y,
    aes(x = 0, y = 0, xend = xend, yend = yend),
    col = var.arrow.col.Y,
    arrow = arrow(length = unit(var.arrow.length, "cm")),
    linewidth = var.arrow.size,
    show.legend = FALSE
  )
gg_biplot

## labels of X variables
gg_biplot <- gg_biplot +
  geom_text_repel(
    data = arrows.X,
    aes(x = xend, y = yend, label = label),
    col = var.arrow.col.X
  )

## labels of Y variables
gg_biplot <- gg_biplot +
  geom_text_repel(
    data = arrows.Y,
    aes(x = xend, y = yend, label = label),
    col = var.arrow.col.Y
  )
MINT_sPLS_taxa_biplot <- gg_biplot
MINT_sPLS_taxa_biplot

And then also on functions:

# MINT sPLS on the functions; the Y dataset (metadata_MINT_biplot) is the one already prepared for the taxa
# NOTE(review): Y was taken from megan_genus_clr while the study labels come from megan_go_clr_5 -
# this presumably relies on both objects holding the samples in the same order; confirm
res <- mint.spls(
  X = GOs_biplot_names,
  Y = metadata_MINT_biplot,
  ncomp = 2,
  keepX = c(10, 10),
  study = sample_data(megan_go_clr_5)$Sampling_trip
)

plotIndiv(res)
# Sample plot coloured by sampling trip (symbol represents study)
plotIndiv(
  res,
  group = res$study,
  # title = 'global MINT sPLS | Microbial Function-WQ',
  # legend.title = 'Sampling Trip',
  legend = TRUE,
  rep.space = "XY-variate",
  col.per.group = c(
    "indianred", # Sampling trip 1
    "indianred4", # Sampling trip 2
    "red3", # Sampling trip 3
    "slateblue" # Sampling trip 4
  )
)
plotVar(res)
library(ggrepel)

# ---- MINT sPLS biplot: microbial functions (X) vs WQ (Y) ----
# Samples are drawn at their X-variates. Every selected X and Y variable is
# drawn as an arrow ending at its correlation with the two components,
# rescaled to the range of the sample scores.

# INPUT ARGUMENTS
col <- res$study # color of samples according to the study
pch <- res$study # pch of samples according to the study
var.names.col <- "grey40"
var.names.size <- 4
var.arrow.col.X <- "lightblue" # color of arrow + name for X data set
var.arrow.col.Y <- "orange" # color of arrow + name for Y data set
var.arrow.size <- 0.5
var.arrow.length <- 0.2
# components to be plotted
comp1 <- 1
comp2 <- 2
# input the MINT res object
object <- res
comp <- object$ncomp


## --- code starts here --------
# identify variables selected: a variable counts as selected when any of its
# loadings is non-zero. Each loading is tested on its own because a plain row
# sum would also be zero for loadings that cancel out across components.
selection.X <- rowSums(object$loadings$X[, seq_len(comp), drop = FALSE] != 0) > 0
selection.Y <- rowSums(object$loadings$Y[, seq_len(comp), drop = FALSE] != 0) > 0
loadings.X <- data.frame(object$loadings$X[selection.X, , drop = FALSE])
loadings.Y <- data.frame(object$loadings$Y[selection.Y, , drop = FALSE])

# correlation of every selected variable with the components of its own data set
# cutoff for the correlation circle plot (0 = keep every selected variable)
cutoff <- 0
cors.X <- cor(object$X[, selection.X, drop = FALSE], object$variates$X[, seq_len(comp)], use = "pairwise.complete.obs")
cors.Y <- cor(object$Y[, selection.Y, drop = FALSE], object$variates$Y[, seq_len(comp)], use = "pairwise.complete.obs")
above.cutoff.X <- apply(cors.X, 1, function(x) any(abs(x) >= cutoff))
above.cutoff.Y <- apply(cors.Y, 1, function(x) any(abs(x) >= cutoff))
loadings.X <- loadings.X[above.cutoff.X, ]
loadings.Y <- loadings.Y[above.cutoff.Y, ]

# only representing the samples in the X space
variates <- data.frame(object$variates$X)

## scaler of var vs sample coordinates - one for each data set
scaler.X <- max(object$variates$X, na.rm = TRUE) / max(abs(cors.X), na.rm = TRUE)
scaler.Y <- max(object$variates$Y, na.rm = TRUE) / max(abs(cors.Y), na.rm = TRUE)

axes.titles <- c("Comp 1", "Comp 2")

# the correlations are rescaled - need to fiddle a bit here
cors.X <- cors.X * scaler.X * 0.7
cors.Y <- cors.Y * scaler.Y * 0.8

var.labels.X <- rownames(loadings.X)
var.labels.Y <- rownames(loadings.Y)

# Plot data. Every layer gets its own data frame, so the stored plot keeps a
# copy of its coordinates. Bare vectors inside aes() are only looked up when
# the plot is printed, so the plot would otherwise change whenever `res`,
# `variates`, `cors.X`, ... are reassigned.
samples.df <- data.frame(
  comp.x = variates[, comp1],
  comp.y = variates[, comp2],
  colour.by = col,
  shape.by = pch
)
# The cutoff is applied to arrows and labels alike, so they always line up
arrows.X <- data.frame(
  xend = cors.X[above.cutoff.X, comp1],
  yend = cors.X[above.cutoff.X, comp2],
  label = var.labels.X,
  row.names = NULL
)
arrows.Y <- data.frame(
  xend = cors.Y[above.cutoff.Y, comp1],
  yend = cors.Y[above.cutoff.Y, comp2],
  label = var.labels.Y,
  row.names = NULL
)

## ------------- outline of plot -----
gg_biplot <- ggplot() +
  theme_classic() +
  labs(x = axes.titles[1], y = axes.titles[2]) +
  # vline and hline - comment these two layers out if you don't want the lines
  # (`linewidth` replaces `size`, which is deprecated for lines since ggplot2 3.4.0)
  geom_vline(xintercept = 0, linewidth = 0.3, col = "grey75") +
  geom_hline(yintercept = 0, linewidth = 0.3, col = "grey75")
gg_biplot
# ------


# PLOT SAMPLES
gg_biplot <- gg_biplot +
  geom_point(
    data = samples.df,
    aes(x = comp.x, y = comp.y, col = colour.by, shape = shape.by),
    size = 2,
    show.legend = FALSE
  ) +
  scale_color_manual(
    values = c(
      "indianred", # Sampling trip 1
      "indianred4", # Sampling trip 2
      "red3", # Sampling trip 3
      "slateblue" # Sampling trip 4
    )
  )
gg_biplot

# PLOT VARIABLES
## lines and arrows
# X variables
gg_biplot <- gg_biplot +
  geom_segment(
    data = arrows.X,
    aes(x = 0, y = 0, xend = xend, yend = yend),
    col = var.arrow.col.X,
    arrow = arrow(length = unit(var.arrow.length, "cm")),
    linewidth = var.arrow.size,
    show.legend = FALSE
  )
gg_biplot

# Y variables
gg_biplot <- gg_biplot +
  geom_segment(
    data = arrows.Y,
    aes(x = 0, y = 0, xend = xend, yend = yend),
    col = var.arrow.col.Y,
    arrow = arrow(length = unit(var.arrow.length, "cm")),
    linewidth = var.arrow.size,
    show.legend = FALSE
  )
gg_biplot

## labels of X variables
gg_biplot <- gg_biplot +
  geom_text_repel(
    data = arrows.X,
    aes(x = xend, y = yend, label = label),
    col = var.arrow.col.X
  )

## labels of Y variables
gg_biplot <- gg_biplot +
  geom_text_repel(
    data = arrows.Y,
    aes(x = xend, y = yend, label = label),
    col = var.arrow.col.Y
  )
MINT_sPLS_GOs_biplot <- gg_biplot
MINT_sPLS_GOs_biplot
## Warning: ggrepel: 6 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

But let’s visualise these correlations as a heatmap:

# Build longer display names (Class through Species), not just family and genus.
# The abundance table and the taxonomy table are each converted to a data frame
# with their row names moved into an "OTU" column, then joined.
MINT_sPLS_ind_names_cim <- otu_table(megan_genus_clr) %>%
  as.data.frame() %>%
  rownames_to_column("OTU") %>%
  left_join(
    tax_table(megan_genus_clr) %>%
      as.data.frame() %>%
      rownames_to_column("OTU")
  ) %>%
  unite(taxonomy, c(Class, Order, Family, Genus, Species), sep = "; ") # one taxonomy string per row
## Joining, by = "OTU"
# Keep only the combined taxonomy strings, as a character vector
MINT_sPLS_ind_names_cim <- as.character(MINT_sPLS_ind_names_cim[["taxonomy"]])

# Clustered image map (heatmap) of the taxa/WQ MINT sPLS model on components 1-2.
# The result is stored (cim_mint.spls2.WQ.taxa.OTUs) so it can later be merged
# with the stability scores based on OTU IDs.
cim_mint.spls2.WQ.taxa.OTUs <- cim(
  mint.spls2.WQ.taxa,
  comp = 1:2,
  title = "MINT sPLS Taxa/WQ (PCs 1 and 2)",
  xlab = "WQ parameters",
  ylab = "Indicator microbial taxa",
  # row.names = MINT_sPLS_ind_names_cim,  # disabled: long taxonomy names not used here
  margins = c(
    19, # bottom
    28  # right
  ),
  keysize = c(1, 0.4),
  symkey = FALSE
)
MINT sPLS - Taxa/WQ. Microbial taxa that were identified as indicators are shown on the y axis, while WQ measurements are shown on the x axis. These molecular signatures are shared across the four sampling transects. The scale shows similarity values (partial correlations) between the X and Y variables selected across the first two MINT sPLS dimensions; rows and columns are hierarchically clustered using complete linkage on Euclidean distances. The color indicates either positive (red) or negative (blue) correlation.

MINT sPLS - Taxa/WQ. Microbial taxa that were identified as indicators are shown on the y axis, while WQ measurements are shown on the x axis. These molecular signatures are shared across the four sampling transects. The scale shows similarity values (partial correlations) between the X and Y variables selected across the first two MINT sPLS dimensions; rows and columns are hierarchically clustered using complete linkage on Euclidean distances. The color indicates either positive (red) or negative (blue) correlation.

# Build longer display names for the GO terms by combining Rank3 through Rank6.
# The abundance table and the annotation table are each converted to a data
# frame with their row names moved into an "OTU" column, then joined.
MINT_sPLS_GOs_ind_names_cim <- otu_table(megan_go_clr_5) %>%
  as.data.frame() %>%
  rownames_to_column("OTU") %>%
  left_join(
    tax_table(megan_go_clr_5) %>%
      as.data.frame() %>%
      rownames_to_column("OTU")
  ) %>%
  unite(Functions, c(Rank3, Rank4, Rank5, Rank6), sep = "; ") # one function string per row
## Joining, by = "OTU"
# Keep only the combined function strings, as a character vector
MINT_sPLS_GOs_ind_names_cim <- as.character(MINT_sPLS_GOs_ind_names_cim[["Functions"]])

# Clustered image map (heatmap) of the GOs/WQ MINT sPLS model on components 1-2.
# The result is stored (cim_mint.spls2.WQ.GOs) so it can later be merged with
# the stability scores based on OTU IDs.
cim_mint.spls2.WQ.GOs <- cim(
  mint.spls2.WQ.GOs,
  comp = 1:2,
  title = "MINT sPLS GOs/WQ (PCs 1 and 2)",
  xlab = "WQ parameters",
  ylab = "Indicator microbial GO terms (genes/functions)",
  # row.names = MINT_sPLS_GOs_ind_names_cim,  # disabled: long function names not used here
  margins = c(
    19, # bottom
    50  # right
  ),
  keysize = c(1, 0.4),
  symkey = FALSE
)
MINT sPLS - GOs/WQ. Microbial GO terms (genes and functions) that were identified as indicators are shown on the y axis, while WQ measurements are shown on the x axis. These molecular signatures are shared across the four sampling transects. The scale shows similarity values (partial correlations) between the X and Y variables selected across the first two MINT sPLS dimensions; rows and columns are hierarchically clustered using complete linkage on Euclidean distances. The color indicates either positive (red) or negative (blue) correlation.

MINT sPLS - GOs/WQ. Microbial GO terms (genes and functions) that were identified as indicators are shown on the y axis, while WQ measurements are shown on the x axis. These molecular signatures are shared across the four sampling transects. The scale shows similarity values (partial correlations) between the X and Y variables selected across the first two MINT sPLS dimensions; rows and columns are hierarchically clustered using complete linkage on Euclidean distances. The color indicates either positive (red) or negative (blue) correlation.

Which of these associations are stable across sampling trips? Leave-One-Group-Out Cross Validation (LOGOCV)

Below is example code from Kim-Anh that computes stability scores for the indicator taxa/genes (selected on MINT sPLS dimension 1) across sampling trips.

library(mixOmics)

data(stemcells)
# 4 studies
summary(stemcells$study)

## STABILITY analysis: learn on 3 data sets at a time, leaving one study out.
## Each run k removes study k, refits the model, and records the genes selected
## on component 1.

list.selected <- NULL # initialise, then store the selected genes at each iteration

for (k in 1:4) { # each run: remove study k
  train.studies <- which(stemcells$study != k)
  X <- stemcells$gene[train.studies, ]
  Y <- stemcells$celltype[train.studies]
  studies <- droplevels(stemcells$study[train.studies])
  # A few (non-extensive) checks. NOTE: values are not auto-printed inside a
  # for loop; wrap these in print() to actually see them.
  summary(Y)
  summary(studies)

  res.train <- mint.splsda(X = X, Y = Y, ncomp = 2, study = studies, keepX = c(50, 50))

  # Append the genes selected on component 1.
  # `comp` is an argument of selectVar(); passing it to c() instead would
  # append a spurious "1" element to the list on every run.
  list.selected <- c(list.selected, selectVar(res.train, comp = 1)$name)
}

length(list.selected) # ok, we should have 50 genes selected on comp 1 * 4 runs = 200

table(list.selected) / 4 # where 4 is the number of runs / studies we have done
sort(table(list.selected) / 4, decreasing = TRUE)

We applied this on indicator taxa:

# Number of samples in each of the 4 sampling trips (the MINT "studies")
summary(mint.spls2.WQ.taxa[["study"]])
##  Trip_01_Nov-Dec_2019  Trip_02_January_2020 Trip_03_February_2020 
##                    44                    48                    43 
##     Trip_04_July_2020 
##                    56
# Labels of the four sampling trips; each one is left out once below
IMOS_studies <- c("Trip_01_Nov-Dec_2019",
                  "Trip_02_January_2020",
                  "Trip_03_February_2020",
                  "Trip_04_July_2020")
## STABILITY analysis (leave-one-group-out): learn on 3 trips at a time,
## leaving one trip out per run, and record the taxa selected on component 1.

list.selected <- NULL # initialise, then store the selected taxa at each iteration

for (k in IMOS_studies) { # each run: remove trip k
  train.studies <- which(mint.spls2.WQ.taxa$study != k)
  X <- OTUs_biplot_names[train.studies, ]
  Y <- metadata_MINT_biplot[train.studies, ]
  IMOS.studies <- droplevels(mint.spls2.WQ.taxa$study[train.studies])
  # A few (non-extensive) checks. NOTE: values are not auto-printed inside a
  # for loop; wrap these in print() to actually see them.
  summary(Y)
  summary(IMOS.studies)

  res.train <- mint.spls(X = X, Y = Y, ncomp = 2, study = IMOS.studies, keepX = c(50, 50))

  # Append the X variables (taxa) selected on component 1.
  # `comp` is an argument of selectVar(); it was previously passed to c(),
  # which appended a spurious "1" element to the list on every run (visible as
  # a bogus "1" entry with frequency 1.00 in the frequency table).
  list.selected <- c(list.selected, selectVar(res.train, comp = 1)$X$name)
}

# Saving this as a separate object for taxa
list.selected.taxa <- list.selected

length(list.selected.taxa) # total number of selections on comp 1 summed over the 4 runs
## [1] 227
table(list.selected.taxa) / length(IMOS_studies) # selection frequency = count / number of leave-one-trip-out runs
## list.selected.taxa
##                                                                 1 
##                                                              1.00 
##                                 1046_Chromatiaceae; Unknown Genus 
##                                                              0.25 
##                        1068904_Roseobacteraceae; Primorskyibacter 
##                                                              0.25 
##                         1080193_Flavobacteriaceae; Hyunsoonleella 
##                                                              0.25 
##                                112040_Flavobacteriaceae; Zobellia 
##                                                              0.25 
##                         1123951_Phyllobacteriaceae; Thalassocella 
##                                                              0.25 
##                                1150_Unknown Family; Unknown Genus 
##                                                              0.75 
##                                1161_Unknown Family; Unknown Genus 
##                                                              0.25 
##                             1172191_Alteromonadaceae; Catenovulum 
##                                                              0.25 
##                               118_Planctomycetaceae; Planctomyces 
##                                                              0.25 
##                                118968_Coxiellaceae; Unknown Genus 
##                                                              0.25 
##                         119045_Methylobacteriaceae; Unknown Genus 
##                                                              0.25 
##                            119060_Burkholderiaceae; Unknown Genus 
##                                                              0.50 
##                            1195766_Rhodobacteraceae; Planktotalea 
##                                                              0.25 
##                          1211036_Flavobacteriaceae; Mangrovimonas 
##                                                              0.25 
##                             1220535_Unknown Family; Unknown Genus 
##                                                              0.25 
##                                1224_Unknown Family; Unknown Genus 
##                                                              0.75 
##                                1236_Unknown Family; Unknown Genus 
##                                                              0.25 
##                           1246884_Robiginitomaculaceae; Algimonas 
##                                                              0.25 
##                               125216_Acetobacteraceae; Roseomonas 
##                                                              0.25 
##                  125287_Ornithinimicrobiaceae; Ornithinimicrobium 
##                                                              0.25 
##             1263978_Rhodospirillaceae; Candidatus Endolissoclinum 
##                                                              0.50 
##                                  12916_Comamonadaceae; Acidovorax 
##                                                              0.25 
##                           1331809_Kordiimonadaceae; Unknown Genus 
##                                                              0.25 
##                                  1341118_Halieaceae; Luminiphilus 
##                                                              0.75 
##                              135613_Unknown Family; Unknown Genus 
##                                                              0.25 
##                              135617_Thiotrichaceae; Unknown Genus 
##                                                              0.25 
##                              135619_Unknown Family; Unknown Genus 
##                                                              0.75 
##                              135622_Unknown Family; Unknown Genus 
##                                                              0.75 
##      1389453_Candidatus Actinomarinaceae; Candidatus Actinomarina 
##                                                              0.25 
##                         1400386_Lacipirellulaceae; Bythopirellula 
##                                                              0.25 
##                          1406885_Alteromonadaceae; Aliiglaciecola 
##                                                              0.50 
##                                 1434034_Flavobacteriaceae; Pricia 
##                                                              0.25 
##                             1443919_Rhodobacteraceae; Tabrizicola 
##                                                              0.25 
##                                 1458928_Oscillatoriaceae; Okeania 
##                                                              0.25 
##                                  146_Spirochaetaceae; Spirochaeta 
##                                                              0.25 
##                         1471398_Prolixibacteraceae; Unknown Genus 
##                                                              0.25 
##                     1484898_Hyphomicrobiaceae; Methyloceanibacter 
##                                                              0.25 
##                                 149698_Oxalobacteraceae; Massilia 
##                                                              0.25 
##                            1501348_Amoebophilaceae; Unknown Genus 
##                                                              0.25 
##                                     150830_Stappiaceae; Roseibium 
##                                                              0.25 
##                                     152180_Ahrensiaceae; Ahrensia 
##                                                              0.25 
##                           1524249_Unknown Family; Pseudohongiella 
##                                                              0.25 
##                               1553903_Oligoflexaceae; Oligoflexus 
##                                                              0.25 
##                1564515_Haliscomenobacteraceae; Phaeodactylibacter 
##                                                              0.25 
##                           159345_Roseobacteraceae; Roseibacterium 
##                                                              0.25 
##                           1608457_Rhodobacteraceae; Aestuariivita 
##                                                              0.25 
##                             1617805_Rhodobacteraceae; Amylibacter 
##                                                              0.25 
##                            165697_Sphingomonadaceae; Sphingopyxis 
##                                                              0.50 
##                              167375_Prochlorococcaceae; Cyanobium 
##                                                              0.25 
##                    1676142_Wenzhouxiangellaceae; Wenzhouxiangella 
##                                                              0.25 
## 1680826_Candidatus Thalassarchaeaceae; Candidatus Thalassarchaeum 
##                                                              0.75 
##                             1706369_Unknown Family; Unknown Genus 
##                                                              0.25 
##                               171436_Rhodospirillaceae; Tistrella 
##                                                              0.25 
##                              171552_Prevotellaceae; Unknown Genus 
##                                                              0.25 
##                          1716_Corynebacteriaceae; Corynebacterium 
##                                                              0.25 
##                             1752734_Unknown Family; Unknown Genus 
##                                                              0.25 
##                                1760_Unknown Family; Unknown Genus 
##                                                              0.25 
##                         1775411_Rhodanobacteraceae; Unknown Genus 
##                                                              0.25 
##                        1792291_Cellvibrionaceae; Marinagarivorans 
##                                                              0.25 
##                     1803399_Unknown Family; Candidatus Peribacter 
##                                                              0.50 
##                       1804663_Rhodospirillaceae; Haematospirillum 
##                                                              0.25 
##                               1813606_Balneolaceae; Unknown Genus 
##                                                              0.25 
##                        1822464_Burkholderiaceae; Paraburkholderia 
##                                                              0.25 
##                                182709_Bacillaceae; Oceanobacillus 
##                                                              0.25 
##                              183963_Unknown Family; Unknown Genus 
##                                                              0.25 
##                           1847_Pseudonocardiaceae; Pseudonocardia 
##                                                              0.25 
##                          1853232_Hymenobacteraceae; Unknown Genus 
##                                                              0.25 
##                            186650_Methylobacteriaceae; Microvirga 
##                                                              0.25 
##                              186801_Unknown Family; Unknown Genus 
##                                                              0.25 
##                              186802_Unknown Family; Unknown Genus 
##                                                              0.75 
##                            186822_Paenibacillaceae; Unknown Genus 
##                                                              0.25 
##                             1869227_Unknown Family; Unknown Genus 
##                                                              0.25 
##                             1890424_Unknown Family; Unknown Genus 
##                                                              0.75 
##                           1890426_Synechococcaceae; Unknown Genus 
##                                                              0.25 
##                            1915401_Phyllobacteriaceae; Roseitalea 
##                                                              0.25 
##                               191767_Nannocystaceae; Plesiocystis 
##                                                              0.25 
##                         1931200_Rhodobacteraceae; Marinibacterium 
##                                                              0.75 
##                             194_Campylobacteraceae; Campylobacter 
##                                                              0.25 
##                         194924_Desulfovibrionaceae; Unknown Genus 
##                                                              0.25 
##                       1960290_Sphingosinicellaceae; Pacificimonas 
##                                                              0.25 
##                                   2_Unknown Family; Unknown Genus 
##                                                              0.75 
##                              200644_Unknown Family; Unknown Genus 
##                                                              0.25 
##                                 202746_Thiovulaceae; Sulfurimonas 
##                                                              0.25 
##                              204428_Unknown Family; Unknown Genus 
##                                                              0.25 
##                              204455_Unknown Family; Unknown Genus 
##                                                              0.75 
##                              204456_Rhodobacteraceae; Gemmobacter 
##                                                              0.25 
##                         213421_Desulfuromonadaceae; Unknown Genus 
##                                                              0.25 
##                              213422_Geobacteraceae; Unknown Genus 
##                                                              0.25 
##                            2146_Acholeplasmataceae; Unknown Genus 
##                                                              0.50 
##                           2299_Desulfobacteraceae; Desulfosarcina 
##                                                              0.25 
##                                2383_Lachnospiraceae; Epulopiscium 
##                                                              0.25 
##                                244698_Flavobacteriaceae; Gillisia 
##                                                              0.25 
##                               245186_Roseobacteraceae; Loktanella 
##                                                              0.25 
##                            246873_Crocinitomicaceae; Crocinitomix 
##                                                              0.25 
##                              252356_Flavobacteriaceae; Maribacter 
##                                                              0.25 
##                                  258255_Stappiaceae; Pseudovibrio 
##                                                              0.25 
##                              265488_Pirellulaceae; Rhodopirellula 
##                                                              0.25 
##                        265976_Ornithinimicrobiaceae; Serinicoccus 
##                                                              0.25 
##                                           270_Thermaceae; Thermus 
##                                                              0.25 
##                                274591_Phyllobacteriaceae; Hoeflea 
##                                                              0.25 
##                                 28105_Rhizobiaceae; Sinorhizobium 
##                                                              0.25 
##                               28211_Unknown Family; Unknown Genus 
##                                                              0.25 
##                               28216_Unknown Family; Unknown Genus 
##                                                              0.50 
##                               28221_Unknown Family; Unknown Genus 
##                                                              0.25 
##                           28222_Desulfobacteraceae; Desulfobacula 
##                                                              0.25 
##                              282682_Roseobacteraceae; Citreicella 
##                                                              0.25 
##                       28453_Sphingobacteriaceae; Sphingobacterium 
##                                                              0.25 
##                                 286_Pseudomonadaceae; Pseudomonas 
##                                                              0.25 
##                              288021_Kordiimonadaceae; Kordiimonas 
##                                                              0.25 
##                              291183_Flavobacteriaceae; Lacinutrix 
##                                                              0.25 
##                               315422_Roseobacteraceae; Palleronia 
##                                                              0.25 
##                           316625_Cellvibrionaceae; Saccharophagus 
##                                                              0.25 
##                         31957_Propionibacteriaceae; Unknown Genus 
##                                                              0.25 
##                               31969_Unknown Family; Unknown Genus 
##                                                              0.50 
##                             31989_Rhodobacteraceae; Unknown Genus 
##                                                              0.75 
##                             32033_Xanthomonadaceae; Unknown Genus 
##                                                              0.25 
##                             335927_Roseobacteraceae; Thalassobius 
##                                                              0.25 
##                           335928_Xanthobacteraceae; Unknown Genus 
##                                                              0.25 
##                                  336276_Flavobacteriaceae; Olleya 
##                                                              0.25 
##                                 356_Unknown Family; Unknown Genus 
##                                                              0.75 
##                                   357_Rhizobiaceae; Agrobacterium 
##                                                              0.25 
##                               366580_Alteromonadaceae; Bowmanella 
##                                                              0.25 
##                                367771_Roseobacteraceae; Marinovum 
##                                                              0.75 
##                             379068_Flavobacteriaceae; Galbibacter 
##                                                              0.25 
##                             379070_Flavobacteriaceae; Gilvibacter 
##                                                              0.25 
##                           404235_Roseobacteraceae; Maritimibacter 
##                                                              0.25 
##                                 404432_Halomonadaceae; Salinicola 
##                                                              0.25 
##                             41275_Caulobacteraceae; Brevundimonas 
##                                                              0.25 
##                            41294_Bradyrhizobiaceae; Unknown Genus 
##                                                              0.25 
##                            41295_Rhodospirillaceae; Unknown Genus 
##                                                              0.75 
##                            417127_Flavobacteriaceae; Zunongwangia 
##                                                              0.25 
##                            42054_Halomonadaceae; Chromohalobacter 
##                                                              0.25 
##                           436357_Roseobacteraceae; Thalassococcus 
##                                                              0.25 
##                        437504_Granulosicoccaceae; Granulosicoccus 
##                                                              0.25 
##                    437506_Robiginitomaculaceae; Robiginitomaculum 
##                                                              0.25 
##                             45404_Beijerinckiaceae; Unknown Genus 
##                                                              0.25 
##                                     455358_Balneolaceae; Balneola 
##                                                              0.75 
##                             468938_Puniceicoccaceae; Cerasicoccus 
##                                                              0.25 
##                                     478070_Stappiaceae; Labrenzia 
##                                                              0.50 
##                                      482_Neisseriaceae; Neisseria 
##                                                              0.25 
##                             49279_Flavobacteriaceae; Gelidibacter 
##                                                              0.25 
##                               51291_Unknown Family; Unknown Genus 
##                                                              0.25 
##                             543_Enterobacteriaceae; Unknown Genus 
##                                                              0.75 
##                              544448_Unknown Family; Unknown Genus 
##                                                              0.50 
##                               561_Enterobacteriaceae; Escherichia 
##                                                              0.50 
##                             568386_Sinobacteraceae; Unknown Genus 
##                                                              0.25 
##                                570_Enterobacteriaceae; Klebsiella 
##                                                              0.50 
##                            574899_Verrucomicrobiaceae; Haloferula 
##                                                              0.75 
##                               62680_Unknown Family; Unknown Genus 
##                                                              0.25 
##                                 649462_Balneolaceae; Gracilimonas 
##                                                              0.25 
##                      655184_Unknown Family; Candidatus Thioglobus 
##                                                              0.25 
##                          655352_Cohaesibacteraceae; Cohaesibacter 
##                                                              0.25 
##                               65842_Unknown Family; Unknown Genus 
##                                                              0.25 
##                       72276_Ectothiorhodospiraceae; Unknown Genus 
##                                                              0.25 
##                                  75_Caulobacteraceae; Caulobacter 
##                                                              0.25 
##                             759360_Oceanospirillaceae; Oleibacter 
##                                                              0.25 
##                               762641_Flavobacteriaceae; Muriicola 
##                                                              0.25 
##                                 76831_Flavobacteriaceae; Myroides 
##                                                              0.25 
##                               80864_Comamonadaceae; Unknown Genus 
##                                                              0.25 
##                                     80865_Comamonadaceae; Delftia 
##                                                              1.00 
##                              81_Hyphomicrobiaceae; Hyphomicrobium 
##                                                              0.25 
##                                 82115_Rhizobiaceae; Unknown Genus 
##                                                              0.25 
##                                            85413_Boseaceae; Bosea 
##                                                              0.25 
##                                866673_Marinifilaceae; Marinifilum 
##                                                              0.25 
##                                 904708_Arenicellaceae; Arenicella 
##                                                              0.25 
##                     907197_Pseudoalteromonadaceae; Psychrosphaera 
##                                                              0.25 
##                               91347_Unknown Family; Unknown Genus 
##                                                              0.25 
##                               914_Nitrosomonadaceae; Nitrosomonas 
##                                                              0.25 
##                                 976_Unknown Family; Unknown Genus 
##                                                              0.75 
##                    986106_Acidiferrobacteraceae; Acidiferrobacter 
##                                                              0.25
sort(table(list.selected.taxa) / length(IMOS_studies), decreasing = TRUE) # most stably selected taxa first
## list.selected.taxa
##                                                                 1 
##                                                              1.00 
##                                     80865_Comamonadaceae; Delftia 
##                                                              1.00 
##                                1150_Unknown Family; Unknown Genus 
##                                                              0.75 
##                                1224_Unknown Family; Unknown Genus 
##                                                              0.75 
##                                  1341118_Halieaceae; Luminiphilus 
##                                                              0.75 
##                              135619_Unknown Family; Unknown Genus 
##                                                              0.75 
##                              135622_Unknown Family; Unknown Genus 
##                                                              0.75 
## 1680826_Candidatus Thalassarchaeaceae; Candidatus Thalassarchaeum 
##                                                              0.75 
##                              186802_Unknown Family; Unknown Genus 
##                                                              0.75 
##                             1890424_Unknown Family; Unknown Genus 
##                                                              0.75 
##                         1931200_Rhodobacteraceae; Marinibacterium 
##                                                              0.75 
##                                   2_Unknown Family; Unknown Genus 
##                                                              0.75 
##                              204455_Unknown Family; Unknown Genus 
##                                                              0.75 
##                             31989_Rhodobacteraceae; Unknown Genus 
##                                                              0.75 
##                                 356_Unknown Family; Unknown Genus 
##                                                              0.75 
##                                367771_Roseobacteraceae; Marinovum 
##                                                              0.75 
##                            41295_Rhodospirillaceae; Unknown Genus 
##                                                              0.75 
##                                     455358_Balneolaceae; Balneola 
##                                                              0.75 
##                             543_Enterobacteriaceae; Unknown Genus 
##                                                              0.75 
##                            574899_Verrucomicrobiaceae; Haloferula 
##                                                              0.75 
##                                 976_Unknown Family; Unknown Genus 
##                                                              0.75 
##                            119060_Burkholderiaceae; Unknown Genus 
##                                                              0.50 
##             1263978_Rhodospirillaceae; Candidatus Endolissoclinum 
##                                                              0.50 
##                          1406885_Alteromonadaceae; Aliiglaciecola 
##                                                              0.50 
##                            165697_Sphingomonadaceae; Sphingopyxis 
##                                                              0.50 
##                     1803399_Unknown Family; Candidatus Peribacter 
##                                                              0.50 
##                            2146_Acholeplasmataceae; Unknown Genus 
##                                                              0.50 
##                               28216_Unknown Family; Unknown Genus 
##                                                              0.50 
##                               31969_Unknown Family; Unknown Genus 
##                                                              0.50 
##                                     478070_Stappiaceae; Labrenzia 
##                                                              0.50 
##                              544448_Unknown Family; Unknown Genus 
##                                                              0.50 
##                               561_Enterobacteriaceae; Escherichia 
##                                                              0.50 
##                                570_Enterobacteriaceae; Klebsiella 
##                                                              0.50 
##                                 1046_Chromatiaceae; Unknown Genus 
##                                                              0.25 
##                        1068904_Roseobacteraceae; Primorskyibacter 
##                                                              0.25 
##                         1080193_Flavobacteriaceae; Hyunsoonleella 
##                                                              0.25 
##                                112040_Flavobacteriaceae; Zobellia 
##                                                              0.25 
##                         1123951_Phyllobacteriaceae; Thalassocella 
##                                                              0.25 
##                                1161_Unknown Family; Unknown Genus 
##                                                              0.25 
##                             1172191_Alteromonadaceae; Catenovulum 
##                                                              0.25 
##                               118_Planctomycetaceae; Planctomyces 
##                                                              0.25 
##                                118968_Coxiellaceae; Unknown Genus 
##                                                              0.25 
##                         119045_Methylobacteriaceae; Unknown Genus 
##                                                              0.25 
##                            1195766_Rhodobacteraceae; Planktotalea 
##                                                              0.25 
##                          1211036_Flavobacteriaceae; Mangrovimonas 
##                                                              0.25 
##                             1220535_Unknown Family; Unknown Genus 
##                                                              0.25 
##                                1236_Unknown Family; Unknown Genus 
##                                                              0.25 
##                           1246884_Robiginitomaculaceae; Algimonas 
##                                                              0.25 
##                               125216_Acetobacteraceae; Roseomonas 
##                                                              0.25 
##                  125287_Ornithinimicrobiaceae; Ornithinimicrobium 
##                                                              0.25 
##                                  12916_Comamonadaceae; Acidovorax 
##                                                              0.25 
##                           1331809_Kordiimonadaceae; Unknown Genus 
##                                                              0.25 
##                              135613_Unknown Family; Unknown Genus 
##                                                              0.25 
##                              135617_Thiotrichaceae; Unknown Genus 
##                                                              0.25 
##      1389453_Candidatus Actinomarinaceae; Candidatus Actinomarina 
##                                                              0.25 
##                         1400386_Lacipirellulaceae; Bythopirellula 
##                                                              0.25 
##                                 1434034_Flavobacteriaceae; Pricia 
##                                                              0.25 
##                             1443919_Rhodobacteraceae; Tabrizicola 
##                                                              0.25 
##                                 1458928_Oscillatoriaceae; Okeania 
##                                                              0.25 
##                                  146_Spirochaetaceae; Spirochaeta 
##                                                              0.25 
##                         1471398_Prolixibacteraceae; Unknown Genus 
##                                                              0.25 
##                     1484898_Hyphomicrobiaceae; Methyloceanibacter 
##                                                              0.25 
##                                 149698_Oxalobacteraceae; Massilia 
##                                                              0.25 
##                            1501348_Amoebophilaceae; Unknown Genus 
##                                                              0.25 
##                                     150830_Stappiaceae; Roseibium 
##                                                              0.25 
##                                     152180_Ahrensiaceae; Ahrensia 
##                                                              0.25 
##                           1524249_Unknown Family; Pseudohongiella 
##                                                              0.25 
##                               1553903_Oligoflexaceae; Oligoflexus 
##                                                              0.25 
##                1564515_Haliscomenobacteraceae; Phaeodactylibacter 
##                                                              0.25 
##                           159345_Roseobacteraceae; Roseibacterium 
##                                                              0.25 
##                           1608457_Rhodobacteraceae; Aestuariivita 
##                                                              0.25 
##                             1617805_Rhodobacteraceae; Amylibacter 
##                                                              0.25 
##                              167375_Prochlorococcaceae; Cyanobium 
##                                                              0.25 
##                    1676142_Wenzhouxiangellaceae; Wenzhouxiangella 
##                                                              0.25 
##                             1706369_Unknown Family; Unknown Genus 
##                                                              0.25 
##                               171436_Rhodospirillaceae; Tistrella 
##                                                              0.25 
##                              171552_Prevotellaceae; Unknown Genus 
##                                                              0.25 
##                          1716_Corynebacteriaceae; Corynebacterium 
##                                                              0.25 
##                             1752734_Unknown Family; Unknown Genus 
##                                                              0.25 
##                                1760_Unknown Family; Unknown Genus 
##                                                              0.25 
##                         1775411_Rhodanobacteraceae; Unknown Genus 
##                                                              0.25 
##                        1792291_Cellvibrionaceae; Marinagarivorans 
##                                                              0.25 
##                       1804663_Rhodospirillaceae; Haematospirillum 
##                                                              0.25 
##                               1813606_Balneolaceae; Unknown Genus 
##                                                              0.25 
##                        1822464_Burkholderiaceae; Paraburkholderia 
##                                                              0.25 
##                                182709_Bacillaceae; Oceanobacillus 
##                                                              0.25 
##                              183963_Unknown Family; Unknown Genus 
##                                                              0.25 
##                           1847_Pseudonocardiaceae; Pseudonocardia 
##                                                              0.25 
##                          1853232_Hymenobacteraceae; Unknown Genus 
##                                                              0.25 
##                            186650_Methylobacteriaceae; Microvirga 
##                                                              0.25 
##                              186801_Unknown Family; Unknown Genus 
##                                                              0.25 
##                            186822_Paenibacillaceae; Unknown Genus 
##                                                              0.25 
##                             1869227_Unknown Family; Unknown Genus 
##                                                              0.25 
##                           1890426_Synechococcaceae; Unknown Genus 
##                                                              0.25 
##                            1915401_Phyllobacteriaceae; Roseitalea 
##                                                              0.25 
##                               191767_Nannocystaceae; Plesiocystis 
##                                                              0.25 
##                             194_Campylobacteraceae; Campylobacter 
##                                                              0.25 
##                         194924_Desulfovibrionaceae; Unknown Genus 
##                                                              0.25 
##                       1960290_Sphingosinicellaceae; Pacificimonas 
##                                                              0.25 
##                              200644_Unknown Family; Unknown Genus 
##                                                              0.25 
##                                 202746_Thiovulaceae; Sulfurimonas 
##                                                              0.25 
##                              204428_Unknown Family; Unknown Genus 
##                                                              0.25 
##                              204456_Rhodobacteraceae; Gemmobacter 
##                                                              0.25 
##                         213421_Desulfuromonadaceae; Unknown Genus 
##                                                              0.25 
##                              213422_Geobacteraceae; Unknown Genus 
##                                                              0.25 
##                           2299_Desulfobacteraceae; Desulfosarcina 
##                                                              0.25 
##                                2383_Lachnospiraceae; Epulopiscium 
##                                                              0.25 
##                                244698_Flavobacteriaceae; Gillisia 
##                                                              0.25 
##                               245186_Roseobacteraceae; Loktanella 
##                                                              0.25 
##                            246873_Crocinitomicaceae; Crocinitomix 
##                                                              0.25 
##                              252356_Flavobacteriaceae; Maribacter 
##                                                              0.25 
##                                  258255_Stappiaceae; Pseudovibrio 
##                                                              0.25 
##                              265488_Pirellulaceae; Rhodopirellula 
##                                                              0.25 
##                        265976_Ornithinimicrobiaceae; Serinicoccus 
##                                                              0.25 
##                                           270_Thermaceae; Thermus 
##                                                              0.25 
##                                274591_Phyllobacteriaceae; Hoeflea 
##                                                              0.25 
##                                 28105_Rhizobiaceae; Sinorhizobium 
##                                                              0.25 
##                               28211_Unknown Family; Unknown Genus 
##                                                              0.25 
##                               28221_Unknown Family; Unknown Genus 
##                                                              0.25 
##                           28222_Desulfobacteraceae; Desulfobacula 
##                                                              0.25 
##                              282682_Roseobacteraceae; Citreicella 
##                                                              0.25 
##                       28453_Sphingobacteriaceae; Sphingobacterium 
##                                                              0.25 
##                                 286_Pseudomonadaceae; Pseudomonas 
##                                                              0.25 
##                              288021_Kordiimonadaceae; Kordiimonas 
##                                                              0.25 
##                              291183_Flavobacteriaceae; Lacinutrix 
##                                                              0.25 
##                               315422_Roseobacteraceae; Palleronia 
##                                                              0.25 
##                           316625_Cellvibrionaceae; Saccharophagus 
##                                                              0.25 
##                         31957_Propionibacteriaceae; Unknown Genus 
##                                                              0.25 
##                             32033_Xanthomonadaceae; Unknown Genus 
##                                                              0.25 
##                             335927_Roseobacteraceae; Thalassobius 
##                                                              0.25 
##                           335928_Xanthobacteraceae; Unknown Genus 
##                                                              0.25 
##                                  336276_Flavobacteriaceae; Olleya 
##                                                              0.25 
##                                   357_Rhizobiaceae; Agrobacterium 
##                                                              0.25 
##                               366580_Alteromonadaceae; Bowmanella 
##                                                              0.25 
##                             379068_Flavobacteriaceae; Galbibacter 
##                                                              0.25 
##                             379070_Flavobacteriaceae; Gilvibacter 
##                                                              0.25 
##                           404235_Roseobacteraceae; Maritimibacter 
##                                                              0.25 
##                                 404432_Halomonadaceae; Salinicola 
##                                                              0.25 
##                             41275_Caulobacteraceae; Brevundimonas 
##                                                              0.25 
##                            41294_Bradyrhizobiaceae; Unknown Genus 
##                                                              0.25 
##                            417127_Flavobacteriaceae; Zunongwangia 
##                                                              0.25 
##                            42054_Halomonadaceae; Chromohalobacter 
##                                                              0.25 
##                           436357_Roseobacteraceae; Thalassococcus 
##                                                              0.25 
##                        437504_Granulosicoccaceae; Granulosicoccus 
##                                                              0.25 
##                    437506_Robiginitomaculaceae; Robiginitomaculum 
##                                                              0.25 
##                             45404_Beijerinckiaceae; Unknown Genus 
##                                                              0.25 
##                             468938_Puniceicoccaceae; Cerasicoccus 
##                                                              0.25 
##                                      482_Neisseriaceae; Neisseria 
##                                                              0.25 
##                             49279_Flavobacteriaceae; Gelidibacter 
##                                                              0.25 
##                               51291_Unknown Family; Unknown Genus 
##                                                              0.25 
##                             568386_Sinobacteraceae; Unknown Genus 
##                                                              0.25 
##                               62680_Unknown Family; Unknown Genus 
##                                                              0.25 
##                                 649462_Balneolaceae; Gracilimonas 
##                                                              0.25 
##                      655184_Unknown Family; Candidatus Thioglobus 
##                                                              0.25 
##                          655352_Cohaesibacteraceae; Cohaesibacter 
##                                                              0.25 
##                               65842_Unknown Family; Unknown Genus 
##                                                              0.25 
##                       72276_Ectothiorhodospiraceae; Unknown Genus 
##                                                              0.25 
##                                  75_Caulobacteraceae; Caulobacter 
##                                                              0.25 
##                             759360_Oceanospirillaceae; Oleibacter 
##                                                              0.25 
##                               762641_Flavobacteriaceae; Muriicola 
##                                                              0.25 
##                                 76831_Flavobacteriaceae; Myroides 
##                                                              0.25 
##                               80864_Comamonadaceae; Unknown Genus 
##                                                              0.25 
##                              81_Hyphomicrobiaceae; Hyphomicrobium 
##                                                              0.25 
##                                 82115_Rhizobiaceae; Unknown Genus 
##                                                              0.25 
##                                            85413_Boseaceae; Bosea 
##                                                              0.25 
##                                866673_Marinifilaceae; Marinifilum 
##                                                              0.25 
##                                 904708_Arenicellaceae; Arenicella 
##                                                              0.25 
##                     907197_Pseudoalteromonadaceae; Psychrosphaera 
##                                                              0.25 
##                               91347_Unknown Family; Unknown Genus 
##                                                              0.25 
##                               914_Nitrosomonadaceae; Nitrosomonas 
##                                                              0.25 
##                    986106_Acidiferrobacteraceae; Acidiferrobacter 
##                                                              0.25

Plotting MINT sPLS stability scores together with the heatmap, for taxa:

# The row names of mint.spls2.WQ.taxa.mat.cor give the order in which indicator taxa appear in the heatmap.
# Use that order when setting the factor levels inside aes() of the ggplot2 barplot.

# Stability scores on dimension 1: number of times each taxon was selected, divided by the 4 runs.
# NOTE(review): the table printed above contains a label "1" (score 1.00) with no "_<taxonomy>" part;
# separate() will fill its `taxa` field with NA - confirm whether that entry is genuine.
MINT_sPLS_dim1_stability <- (table(list.selected.taxa) / 4) %>%
  as.data.frame() %>%
  # Labels are "<OTU>_<taxonomy>": split at the underscore
  separate(col = "list.selected.taxa", into = c("OTU", "taxa"), sep = "_") %>%
  # The taxonomy half is not needed here, only the OTU id and its score
  dplyr::select(-taxa)

# Lookup table with one "Family; Genus" label per OTU
OTUs_biplot_colnames <- otu_table(megan_genus_clr) %>%
  as.data.frame() %>%
  rownames_to_column("OTU") %>%
  left_join(
    tax_table(megan_genus_clr) %>%
      as.data.frame() %>%
      rownames_to_column("OTU")
  ) %>%
  unite(taxonomy, c(Family, Genus), sep = "; ") %>% # Adding Taxonomy info
  dplyr::select("OTU", "taxonomy")
## Joining, by = "OTU"

# Attaching the taxonomy label to each stability score (joined on the shared columns)
MINT_sPLS_dim1_stability <- MINT_sPLS_dim1_stability %>%
  left_join(OTUs_biplot_colnames)

# Fitting MINT sPLS again, this time on a matrix whose names are OTUs only (not OTUs and taxonomy)
MINT_sPLS_similarity_scores_and_LOGOCV_taxa <- mint.spls(
  X = OTUs_biplot,
  Y = sample_data(megan_genus_clr)[, 24:40],
  study = sample_data(megan_genus_clr)$Sampling_trip,
  ncomp = 2,
  keepX = keepX, # 50 taxa on dims 1 and 2
  mode = "regression"
)

# The fitted model is overwritten by its cim() output; this object only exists so that the
# heatmap's correlation matrix can be merged with the stability scores based on OTU IDs
MINT_sPLS_similarity_scores_and_LOGOCV_taxa <- cim(
  MINT_sPLS_similarity_scores_and_LOGOCV_taxa,
  comp = 1:2,
  title = "MINT sPLS Taxa/WQ (PCs 1 and 2)",
  xlab = "WQ parameters",
  ylab = "Indicator microbial taxa",
  # row.names = MINT_sPLS_ind_names_cim, # (disabled)
  margins = c(19, 27), # bottom, right
  keysize = c(1, 0.4),
  symkey = FALSE
)

# Correlation matrix behind the MINT sPLS heatmap
mint.spls2.WQ.taxa.mat.cor <- MINT_sPLS_similarity_scores_and_LOGOCV_taxa$mat.cor

# One row per heatmap taxon, with its stability score (and taxonomy) appended
MINT_sPLS_dim1_stability_merged <- mint.spls2.WQ.taxa.mat.cor %>%
  as.data.frame() %>%
  rownames_to_column("OTU") %>%
  left_join(MINT_sPLS_dim1_stability)
# Disabled follow-up step, which would remove those that have NAs as stability scores:
#   %>% filter(if_any(everything(), ~ !is.na(Freq)))
# Barplots
# Horizontal bars of the dim-1 LOGOCV stability score, one bar per indicator taxon.
# Bars are ordered like the rows of the heatmap correlation matrix, and coloured
# by whether the score exceeds 0.25.
MINT_sPLS_stability_plots_ordered.dim1 <- MINT_sPLS_dim1_stability_merged %>%
  # Selecting OTU, Freq and taxonomy by name (previously positions 1, 19 and 20), so the
  # plot does not break if the number of WQ columns in the correlation matrix changes
  dplyr::select(OTU, Freq, taxonomy) %>%
  ggplot(aes(y = factor(OTU, levels = unique(row.names(as.data.frame(mint.spls2.WQ.taxa.mat.cor)))), # heatmap row order
             x = Freq,
             fill = Freq > 0.25)) + # highlight scores above 0.25
  geom_bar(stat = "identity") +
  # guide = "none" replaces the deprecated guide = FALSE
  scale_fill_manual(values = c("FALSE" = "grey40", "TRUE" = "seagreen3"), guide = "none") +
#  scale_y_discrete(limits=rev) + # Reversing the order to match the heatmap
  labs(y = 'MINT sPLS Indicator Microbes',
       x = "LOGOCV Stability score - dim 1",
       title = 'MINT sPLS Are these signals shared across trips?',
       subtitle = 'Leave One Group Out Cross Validation (LOGOCV)') +
  # "none" (lower case) is the documented value for hiding the legend
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12), legend.position = "none")

MINT_sPLS_stability_plots_ordered.dim1

# Now exporting this for RawGraphs, this can go in Supplementary Material
# Takes the data behind the stability barplot, appends the tax_table columns
# (joined on the shared columns) and keeps only taxa that have a stability score.
RawGraphs_shared_taxa_MINT <- MINT_sPLS_stability_plots_ordered.dim1$data %>%
  as.data.frame() %>%
  left_join(tax_table(megan_genus_clr) %>%
              as.data.frame() %>%
              rownames_to_column("OTU")) %>%
#  unite(full_taxonomy, c(Rank1, Rank2, Rank3, Rank4, Rank5, Rank6, Rank7), sep = "; ") %>%  # Adding Taxonomy info
  # Direct NA test on Freq; equivalent to the former if_any(everything(), ~ !is.na(Freq))
  dplyr::filter(!is.na(Freq))
# Exporting as csv (TRUE/FALSE spelled out: T and F are ordinary, reassignable variables)
write.csv(RawGraphs_shared_taxa_MINT, file = "/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Code/output_tables/RawGraphs_shared_taxa_MINT.csv", quote = FALSE, row.names = FALSE)

The same LOGOCV stability analysis is then repeated for the indicator GO terms:

# 4 trips: number of samples per sampling trip (the `study` factor of the GO model)
summary(mint.spls2.WQ.GOs$study)
##  Trip_01_Nov-Dec_2019  Trip_02_January_2020 Trip_03_February_2020 
##                    44                    48                    43 
##     Trip_04_July_2020 
##                    56
# Trip labels to iterate over in the leave-one-study-out runs below
IMOS_studies <- c(
  "Trip_01_Nov-Dec_2019", "Trip_02_January_2020",
  "Trip_03_February_2020", "Trip_04_July_2020"
)
## STABILITY analysis, just learn on 3 data sets at a time, leave one out study
# e.g. here removing study k

# Initialise; the GO terms selected on component 1 are appended at each iteration
list.selected <- NULL

for (k in IMOS_studies) { # each run: remove study k
  train.studies <- which(mint.spls2.WQ.GOs$study != k)
  X <- GOs_biplot_names[train.studies, ]
  Y <- metadata_MINT_biplot[train.studies, ]
  IMOS.studies <- droplevels(mint.spls2.WQ.GOs$study[train.studies])
  # do a few checks (here this is not extensive!)
  # NOTE: inside a for loop these results are not auto-printed; wrap them in print() to see them
  summary(Y)
  summary(IMOS.studies)

  res.train <- mint.spls(X = X, Y = Y, ncomp = 2, study = IMOS.studies, keepX = c(50, 50))

  # Append the variables selected on component 1.
  # `comp = 1` must be an argument of selectVar(): when it was passed to c() instead,
  # every run appended an extra element "1" to the vector (4 spurious entries over 4 runs).
  list.selected <- c(list.selected, selectVar(res.train, comp = 1)$X$name)
}

# Saving this as a separate object for the GO terms
list.selected.GOs <- list.selected

# Sanity check on the number of collected names: any value other than the
# expected one means something besides feature names was appended in the loop
length(list.selected.GOs) # expected: 50 features selected on comp 1 * 4 runs = 200
## [1] 204
# Stability score: fraction of the leave-one-study-out runs in which each GO term was selected.
# The divisor is the number of runs, i.e. the number of studies left out in turn.
table(list.selected.GOs) / length(IMOS_studies)
## list.selected.GOs
##                                                                                          1 
##                                                                                       1.00 
##                                                      10074_GO:0009058 biosynthetic process 
##                                                                                       0.25 
##                                                      10108_GO:0009058 biosynthetic process 
##                                                                                       0.25 
##                                                       1015_GO:0009058 biosynthetic process 
##                                                                                       0.25 
##                                       10226_IPR010226 NADH-quinone oxidoreductase, chain I 
##                                                                                       1.00 
##                                                   10228_GO:0016491 oxidoreductase activity 
##                                                                                       0.25 
##                                               1036_IPR001036 Acriflavin resistance protein 
##                                                                                       1.00 
##                                       10404_GO:0006807 nitrogen compound metabolic process 
##                                                                                       0.25 
##                                 1062_IPR001062 Transcription antitermination protein, NusG 
##                                                                                       0.25 
##                                                                1063_GO:0006412 translation 
##                                                                                       0.25 
##                                 11284_IPR011284 3-oxoacyl-(acyl-carrier-protein) reductase 
##                                                                                       0.25 
##                                                        115_GO:0009058 biosynthetic process 
##                                                                                       0.25 
##                                  11537_IPR011537 NADH ubiquinone oxidoreductase, F subunit 
##                                                                                       0.50 
##                                              11701_IPR011701 Major facilitator superfamily 
##                                                                                       0.25 
##                       11806_IPR011806 Sulphite reductase, dissimilatory-type alpha subunit 
##                                                                                       0.25 
##                        1182_IPR001182 Probable peptidoglycan glycosyltransferase FtsW/RodA 
##                                                                                       0.25 
##                           11864_IPR011864 Phosphate ABC transporter, permease protein PstC 
##                                                                                       0.50 
##                              11890_IPR024704 Structural maintenance of chromosomes protein 
##                                                                                       0.25 
##                                             119_IPR000119 Histone-like DNA-binding protein 
##                                                                                       0.25 
##                                                         11900_IPR011900 Glutaredoxin, GrxC 
##                                                                                       0.25 
##                                                                1209_GO:0006412 translation 
##                                                                                       0.25 
##                                        12098_IPR012098 SRP-independent targeting protein 3 
##                                                                                       0.25 
##                                       12099_GO:0065003 protein-containing complex assembly 
##                                                                                       0.25 
##                                                      12147_GO:0016740 transferase activity 
##                                                                                       0.25 
##                                                      12245_GO:0009058 biosynthetic process 
##                                                                                       0.50 
##                                                               13025_GO:0006412 translation 
##                                                                                       0.50 
##                                                        131_GO:0009058 biosynthetic process 
##                                                                                       0.25 
##                                  13765_IPR013765 DNA recombination and repair protein RecA 
##                                                                                       0.25 
##                                                                                   13954_NA 
##                                                                                       0.25 
##                                                      14105_GO:0009058 biosynthetic process 
##                                                                                       0.25 
##                              14358_IPR014358 Enoyl-[acyl-carrier-protein] reductase (NADH) 
##                                                                                       1.00 
##                                                       1441_GO:0016740 transferase activity 
##                                                                                       0.25 
##                                                     14434_IPR014434 Monothiol glutaredoxin 
##                                                                                       0.25 
##                                                  15_IPR000015 Outer membrane usher protein 
##                                                                                       0.25 
##                                                   15815_GO:0016491 oxidoreductase activity 
##                                                                                       0.50 
##                          1591_IPR001591 Influenza RNA-dependent RNA polymerase subunit PB2 
##                                                                                       1.00 
##                                                      16299_GO:0009058 biosynthetic process 
##                                                                                       0.25 
##                                                   16484_IPR016484 GTP-binding protein EngA 
##                                                                                       0.50 
##                                16932_IPR016932 Uncharacterised conserved protein UCP029669 
##                                                                                       0.25 
##                          17244_IPR017244 Ribosomal RNA large subunit methyltransferase K/L 
##                                                                                       0.50 
##                                       17649_GO:0006807 nitrogen compound metabolic process 
##                                                                                       0.25 
##  17666_IPR017666 2-aminoethylphosphonate ABC transport system, ATP-binding component PhnT2 
##                                                                                       0.25 
##              17847_IPR017847 Type VI secretion system, RhsGE-associated Vgr  family subset 
##                                                                                       0.25 
##                                               19007_IPR019007 WW domain binding protein 11 
##                                                                                       0.25 
##                                    19407_IPR019407 Cytoplasmic tRNA 2-thiolation protein 2 
##                                                                                       0.25 
##                                                                  1951_IPR001951 Histone H4 
##                                                                                       0.25 
##                                                                1971_GO:0006412 translation 
##                                                                                       0.50 
##                                                                                   19791_NA 
##                                                                                       0.50 
##                        2033_IPR002033 Sec-independent periplasmic protein translocase TatC 
##                                                                                       0.25 
##                           20761_IPR020761 Uncharacterised protein family UPF0114, bacteria 
##                                                                                       0.25 
##                                                      20921_GO:0009058 biosynthetic process 
##                                                                                       0.25 
##                                                                                   20948_NA 
##                                                                                       0.25 
##                                                        21120_GO:0016853 isomerase activity 
##                                                                                       0.25 
##                                                                2132_GO:0006412 translation 
##                                                                                       0.50 
##                                          2141_IPR002141 Influenza virus nucleoprotein (NP) 
##                                                                                       0.25 
##                                                                2150_GO:0006412 translation 
##                                                                                       0.25 
##                                                                 218_GO:0006412 translation 
##                                                                                       0.50 
##                                        2196_GO:0006807 nitrogen compound metabolic process 
##                                                                                       0.25 
##                                                   22270_GO:0016491 oxidoreductase activity 
##                                                                                       0.75 
##                                                                                   22271_NA 
##                                                                                       0.25 
##                                 22941_IPR022941 Signal recognition particle, SRP54 subunit 
##                                                                                       0.25 
##                                           2301_GO:0044281 small molecule metabolic process 
##                                                                                       0.25 
##                                           2302_GO:0044281 small molecule metabolic process 
##                                                                                       0.25 
##                                           2303_GO:0044281 small molecule metabolic process 
##                                                                                       0.25 
##                                                                                   23473_NA 
##                                                                                       0.25 
##                                             2381_IPR002381 Ribonuclease PH, bacterial-type 
##                                                                                       0.50 
##                            24791_GO:0006091 generation of precursor metabolites and energy 
##                                                                                       0.25 
##                                                       2504_GO:0009058 biosynthetic process 
##                                                                                       0.25 
##                                                                                    2549_NA 
##                                                                                       0.25 
##                                       25703_GO:0006807 nitrogen compound metabolic process 
##                                                                                       0.25 
##                                                   26030_IPR001248 Purine-cytosine permease 
##                                                                                       0.25 
##                                          27078_IPR027078 Small nuclear ribonucleoprotein E 
##                                                                                       0.25 
##                                                         27185_IPR017241 Toll-like receptor 
##                                                                                       0.50 
##                                                       2755_GO:0009058 biosynthetic process 
##                                                                                       0.25 
##                                             2781_IPR002781 Transmembrane protein TauE-like 
##                                                                                       0.50 
##                                                          28268_IPR028268 Pianissimo family 
##                                                                                       0.50 
##                                                     2842_IPR002842 V-type ATPase subunit E 
##                                                                                       0.25 
##                                                                                   28927_NA 
##                                                                                       0.25 
##               2975_IPR001019 Guanine nucleotide binding protein (G-protein), alpha subunit 
##                                                                                       0.25 
##                                                               29751_GO:0006412 translation 
##                                                                                       0.25 
##                                                        2994_IPR002994 Surfeit locus 1/Shy1 
##                                                                                       0.25 
##                                      30559_IPR030559 DNA polymerase zeta catalytic subunit 
##                                                                                       0.25 
##                                                31463_IPR031463 MICOS complex subunit Mic12 
##                                                                                       0.25 
##                                                    3170_GO:0016491 oxidoreductase activity 
##                                                                                       0.25 
##                                                            31723_GO:0016829 lyase activity 
##                                                                                       0.75 
##                                                                                    3329_NA 
##                                                                                       0.25 
##                                                       3448_GO:0009058 biosynthetic process 
##                                                                                       0.25 
##                                   3544_IPR003544 Cytochrome c-type biogenesis protein CcmB 
##                                                                                       0.25 
##                                                       3669_GO:0009058 biosynthetic process 
##                                                                                       0.25 
##                                                       3673_GO:0016740 transferase activity 
##                                                                                       0.25 
##                                                       3724_GO:0009058 biosynthetic process 
##                                                                                       0.25 
##                                 3752_IPR003752 Disulphide bond formation protein DsbB/BdbC 
##                                                                                       0.25 
##                                                       3758_GO:0009058 biosynthetic process 
##                                                                                       0.25 
##                                 3764_IPR003764 N-acetylglucosamine-6-phosphate deacetylase 
##                                                                                       0.25 
##                                      3837_IPR003837 Glu-tRNAGln amidotransferase C subunit 
##                                                                                       0.25 
##                                               394_IPR000394 RNA polymerase sigma factor 54 
##                                                                                       0.25 
##                                              4373_IPR004373 Peptide chain release factor 1 
##                                                                                       0.25 
##                                               4506_IPR004506 tRNA-specific 2-thiouridylase 
##                                                                                       0.25 
##                           4528_IPR004528 3-deoxy-D-manno-octulosonate cytidylyltransferase 
##                                                                                       0.25 
##                                                        453_GO:0009058 biosynthetic process 
##                                                                                       0.25 
##                                                  4536_IPR004536 Selenophosphate synthetase 
##                                                                                       0.25 
##                                                       4569_GO:0009058 biosynthetic process 
##                                                                                       0.25 
##                                                       4607_GO:0009058 biosynthetic process 
##                                                                                       0.25 
##                                                       4625_GO:0009058 biosynthetic process 
##                                                                                       0.50 
##                                   4695_IPR004695 Transporter protein SLAC1/Mae1/ Ssu1/TehA 
##                                                                                       0.25 
##                                                      4769_IPR004769 Adenylosuccinate lyase 
##                                                                                       0.25 
##                                                                                    4792_NA 
##                                                                                       0.25 
##                                        4811_GO:0006807 nitrogen compound metabolic process 
##                                                                                       0.50 
##                                                       4835_GO:0016740 transferase activity 
##                                                                                       0.25 
##                                         4903_IPR004903 Lactobacillus surface layer protein 
##                                                                                       0.25 
##                                                4923_IPR004923 Iron permease FTR1/Fip1/EfeU 
##                                                                                       0.25 
##                                                       5128_GO:0009058 biosynthetic process 
##                                                                                       0.25 
##                                                 5133_IPR005133 Na+/H+ antiporter subunit G 
##                                                                                       0.75 
##                                                       5150_GO:0009058 biosynthetic process 
##                                                                                       1.00 
##                                                                 5255_IPR005255 PdxA family 
##                                                                                       0.25 
##                                                                 529_GO:0006412 translation 
##                                                                                       0.25 
##                                        5338_GO:0006807 nitrogen compound metabolic process 
##                                                                                       0.50 
##                                      5650_IPR005650 BlaI transcriptional regulatory family 
##                                                                                       0.25 
##                               5670_IPR005670 Phosphate transport system permease protein 1 
##                                                                                       0.25 
##                                                                5704_GO:0006412 translation 
##                                                                                       0.25 
##                                                            5759_IPR005759 Endonuclease III 
##                                                                                       0.25 
##                                              577_GO:0005975 carbohydrate metabolic process 
##                                                                                       0.25 
##                                                                5813_GO:0006412 translation 
##                                                                                       0.25 
##                                                       5840_IPR005839 Methylthiotransferase 
##                                                                                       0.25 
##                                                       5930_GO:0009058 biosynthetic process 
##                                                                                       0.25 
## 5967_IPR005948 Thiamine/thiamin pyrophosphate-binding periplasmic protein, ABC transporter 
##                                                                                       0.25 
##                                                                 597_GO:0006412 translation 
##                                                                                       0.25 
##                                                       5982_IPR005982 Thioredoxin reductase 
##                                                                                       0.25 
##                                                                5996_GO:0006412 translation 
##                                                                                       0.25 
##                                                                6032_GO:0006412 translation 
##                                                                                       0.25 
##                                                          6035_GO:0046872 metal ion binding 
##                                                                                       0.25 
##                                        6130_GO:0006807 nitrogen compound metabolic process 
##                                                                                       0.50 
##                                                    6298_IPR006298 GTP-binding protein TypA 
##                                                                                       0.25 
##                                                                 630_GO:0006412 translation 
##                                                                                       0.25 
##                                                       639_IPR000639 Epoxide hydrolase-like 
##                                                                                       0.25 
##                                                                                     653_NA 
##                                                                                       0.25 
##                                         682_GO:0006807 nitrogen compound metabolic process 
##                                                                                       0.25 
##                                                                                    7016_NA 
##                                                                                       0.50 
##                                       7225_IPR007225 Exocyst complex component EXOC6/Sec15 
##                                                                                       0.25 
##                                        7269_GO:0006807 nitrogen compound metabolic process 
##                                                                                       0.25 
##                                   7305_IPR007305 Vesicle transport protein, Got1/SFT2-like 
##                                                                                       0.25 
##                                                       7315_GO:0009058 biosynthetic process 
##                                                                                       0.25 
##                                        7325_GO:0006807 nitrogen compound metabolic process 
##                                                                                       0.50 
##                                                                                    7375_NA 
##                                                                                       0.25 
##                                                       7466_GO:0009058 biosynthetic process 
##                                                                                       0.25 
##                                        748_IPR000748 Pseudouridine synthase, RsuA/RluB/E/F 
##                                                                                       0.25 
##                                                          7533_GO:0046872 metal ion binding 
##                                                                                       0.25 
##                                             7721_GO:0005975 carbohydrate metabolic process 
##                                                                                       0.25 
##                                                                                    7801_NA 
##                                                                                       0.25 
##                                       7812_IPR007812 Type II secretion system protein GspL 
##                                                                                       0.25 
##                                        8141_GO:0006807 nitrogen compound metabolic process 
##                                                                                       0.75 
##                                                         92_GO:0009058 biosynthetic process 
##                                                                                       0.25 
##                                                        926_GO:0009058 biosynthetic process 
##                                                                                       0.25 
##                                                93_IPR000093 DNA recombination protein RecR 
##                                                                                       0.25 
##                          9311_IPR009311 Interferon alpha-inducible protein IFI6/IFI27-like 
##                                                                                       0.25 
##                                         968_IPR000968 Influenza nuclear export protein NS2 
##                                                                                       0.25 
##                                                                                    9734_NA 
##                                                                                       0.25
# Same stability scores (selection count / number of runs), most stable first
sort(table(list.selected.GOs) / length(IMOS_studies), decreasing = TRUE)
## list.selected.GOs
##                                                                                          1 
##                                                                                       1.00 
##                                       10226_IPR010226 NADH-quinone oxidoreductase, chain I 
##                                                                                       1.00 
##                                               1036_IPR001036 Acriflavin resistance protein 
##                                                                                       1.00 
##                              14358_IPR014358 Enoyl-[acyl-carrier-protein] reductase (NADH) 
##                                                                                       1.00 
##                          1591_IPR001591 Influenza RNA-dependent RNA polymerase subunit PB2 
##                                                                                       1.00 
##                                                       5150_GO:0009058 biosynthetic process 
##                                                                                       1.00 
##                                                   22270_GO:0016491 oxidoreductase activity 
##                                                                                       0.75 
##                                                            31723_GO:0016829 lyase activity 
##                                                                                       0.75 
##                                                 5133_IPR005133 Na+/H+ antiporter subunit G 
##                                                                                       0.75 
##                                        8141_GO:0006807 nitrogen compound metabolic process 
##                                                                                       0.75 
##                                  11537_IPR011537 NADH ubiquinone oxidoreductase, F subunit 
##                                                                                       0.50 
##                           11864_IPR011864 Phosphate ABC transporter, permease protein PstC 
##                                                                                       0.50 
##                                                      12245_GO:0009058 biosynthetic process 
##                                                                                       0.50 
##                                                               13025_GO:0006412 translation 
##                                                                                       0.50 
##                                                   15815_GO:0016491 oxidoreductase activity 
##                                                                                       0.50 
##                                                   16484_IPR016484 GTP-binding protein EngA 
##                                                                                       0.50 
##                          17244_IPR017244 Ribosomal RNA large subunit methyltransferase K/L 
##                                                                                       0.50 
##                                                                1971_GO:0006412 translation 
##                                                                                       0.50 
##                                                                                   19791_NA 
##                                                                                       0.50 
##                                                                2132_GO:0006412 translation 
##                                                                                       0.50 
##                                                                 218_GO:0006412 translation 
##                                                                                       0.50 
##                                             2381_IPR002381 Ribonuclease PH, bacterial-type 
##                                                                                       0.50 
##                                                         27185_IPR017241 Toll-like receptor 
##                                                                                       0.50 
##                                             2781_IPR002781 Transmembrane protein TauE-like 
##                                                                                       0.50 
##                                                          28268_IPR028268 Pianissimo family 
##                                                                                       0.50 
##                                                       4625_GO:0009058 biosynthetic process 
##                                                                                       0.50 
##                                        4811_GO:0006807 nitrogen compound metabolic process 
##                                                                                       0.50 
##                                        5338_GO:0006807 nitrogen compound metabolic process 
##                                                                                       0.50 
##                                        6130_GO:0006807 nitrogen compound metabolic process 
##                                                                                       0.50 
##                                                                                    7016_NA 
##                                                                                       0.50 
##                                        7325_GO:0006807 nitrogen compound metabolic process 
##                                                                                       0.50 
##                                                      10074_GO:0009058 biosynthetic process 
##                                                                                       0.25 
##                                                      10108_GO:0009058 biosynthetic process 
##                                                                                       0.25 
##                                                       1015_GO:0009058 biosynthetic process 
##                                                                                       0.25 
##                                                   10228_GO:0016491 oxidoreductase activity 
##                                                                                       0.25 
##                                       10404_GO:0006807 nitrogen compound metabolic process 
##                                                                                       0.25 
##                                 1062_IPR001062 Transcription antitermination protein, NusG 
##                                                                                       0.25 
##                                                                1063_GO:0006412 translation 
##                                                                                       0.25 
##                                 11284_IPR011284 3-oxoacyl-(acyl-carrier-protein) reductase 
##                                                                                       0.25 
##                                                        115_GO:0009058 biosynthetic process 
##                                                                                       0.25 
##                                              11701_IPR011701 Major facilitator superfamily 
##                                                                                       0.25 
##                       11806_IPR011806 Sulphite reductase, dissimilatory-type alpha subunit 
##                                                                                       0.25 
##                        1182_IPR001182 Probable peptidoglycan glycosyltransferase FtsW/RodA 
##                                                                                       0.25 
##                              11890_IPR024704 Structural maintenance of chromosomes protein 
##                                                                                       0.25 
##                                             119_IPR000119 Histone-like DNA-binding protein 
##                                                                                       0.25 
##                                                         11900_IPR011900 Glutaredoxin, GrxC 
##                                                                                       0.25 
##                                                                1209_GO:0006412 translation 
##                                                                                       0.25 
##                                        12098_IPR012098 SRP-independent targeting protein 3 
##                                                                                       0.25 
##                                       12099_GO:0065003 protein-containing complex assembly 
##                                                                                       0.25 
##                                                      12147_GO:0016740 transferase activity 
##                                                                                       0.25 
##                                                        131_GO:0009058 biosynthetic process 
##                                                                                       0.25 
##                                  13765_IPR013765 DNA recombination and repair protein RecA 
##                                                                                       0.25 
##                                                                                   13954_NA 
##                                                                                       0.25 
##                                                      14105_GO:0009058 biosynthetic process 
##                                                                                       0.25 
##                                                       1441_GO:0016740 transferase activity 
##                                                                                       0.25 
##                                                     14434_IPR014434 Monothiol glutaredoxin 
##                                                                                       0.25 
##                                                  15_IPR000015 Outer membrane usher protein 
##                                                                                       0.25 
##                                                      16299_GO:0009058 biosynthetic process 
##                                                                                       0.25 
##                                16932_IPR016932 Uncharacterised conserved protein UCP029669 
##                                                                                       0.25 
##                                       17649_GO:0006807 nitrogen compound metabolic process 
##                                                                                       0.25 
##  17666_IPR017666 2-aminoethylphosphonate ABC transport system, ATP-binding component PhnT2 
##                                                                                       0.25 
##              17847_IPR017847 Type VI secretion system, RhsGE-associated Vgr  family subset 
##                                                                                       0.25 
##                                               19007_IPR019007 WW domain binding protein 11 
##                                                                                       0.25 
##                                    19407_IPR019407 Cytoplasmic tRNA 2-thiolation protein 2 
##                                                                                       0.25 
##                                                                  1951_IPR001951 Histone H4 
##                                                                                       0.25 
##                        2033_IPR002033 Sec-independent periplasmic protein translocase TatC 
##                                                                                       0.25 
##                           20761_IPR020761 Uncharacterised protein family UPF0114, bacteria 
##                                                                                       0.25 
##                                                      20921_GO:0009058 biosynthetic process 
##                                                                                       0.25 
##                                                                                   20948_NA 
##                                                                                       0.25 
##                                                        21120_GO:0016853 isomerase activity 
##                                                                                       0.25 
##                                          2141_IPR002141 Influenza virus nucleoprotein (NP) 
##                                                                                       0.25 
##                                                                2150_GO:0006412 translation 
##                                                                                       0.25 
##                                        2196_GO:0006807 nitrogen compound metabolic process 
##                                                                                       0.25 
##                                                                                   22271_NA 
##                                                                                       0.25 
##                                 22941_IPR022941 Signal recognition particle, SRP54 subunit 
##                                                                                       0.25 
##                                           2301_GO:0044281 small molecule metabolic process 
##                                                                                       0.25 
##                                           2302_GO:0044281 small molecule metabolic process 
##                                                                                       0.25 
##                                           2303_GO:0044281 small molecule metabolic process 
##                                                                                       0.25 
##                                                                                   23473_NA 
##                                                                                       0.25 
##                            24791_GO:0006091 generation of precursor metabolites and energy 
##                                                                                       0.25 
##                                                       2504_GO:0009058 biosynthetic process 
##                                                                                       0.25 
##                                                                                    2549_NA 
##                                                                                       0.25 
##                                       25703_GO:0006807 nitrogen compound metabolic process 
##                                                                                       0.25 
##                                                   26030_IPR001248 Purine-cytosine permease 
##                                                                                       0.25 
##                                          27078_IPR027078 Small nuclear ribonucleoprotein E 
##                                                                                       0.25 
##                                                       2755_GO:0009058 biosynthetic process 
##                                                                                       0.25 
##                                                     2842_IPR002842 V-type ATPase subunit E 
##                                                                                       0.25 
##                                                                                   28927_NA 
##                                                                                       0.25 
##               2975_IPR001019 Guanine nucleotide binding protein (G-protein), alpha subunit 
##                                                                                       0.25 
##                                                               29751_GO:0006412 translation 
##                                                                                       0.25 
##                                                        2994_IPR002994 Surfeit locus 1/Shy1 
##                                                                                       0.25 
##                                      30559_IPR030559 DNA polymerase zeta catalytic subunit 
##                                                                                       0.25 
##                                                31463_IPR031463 MICOS complex subunit Mic12 
##                                                                                       0.25 
##                                                    3170_GO:0016491 oxidoreductase activity 
##                                                                                       0.25 
##                                                                                    3329_NA 
##                                                                                       0.25 
##                                                       3448_GO:0009058 biosynthetic process 
##                                                                                       0.25 
##                                   3544_IPR003544 Cytochrome c-type biogenesis protein CcmB 
##                                                                                       0.25 
##                                                       3669_GO:0009058 biosynthetic process 
##                                                                                       0.25 
##                                                       3673_GO:0016740 transferase activity 
##                                                                                       0.25 
##                                                       3724_GO:0009058 biosynthetic process 
##                                                                                       0.25 
##                                 3752_IPR003752 Disulphide bond formation protein DsbB/BdbC 
##                                                                                       0.25 
##                                                       3758_GO:0009058 biosynthetic process 
##                                                                                       0.25 
##                                 3764_IPR003764 N-acetylglucosamine-6-phosphate deacetylase 
##                                                                                       0.25 
##                                      3837_IPR003837 Glu-tRNAGln amidotransferase C subunit 
##                                                                                       0.25 
##                                               394_IPR000394 RNA polymerase sigma factor 54 
##                                                                                       0.25 
##                                              4373_IPR004373 Peptide chain release factor 1 
##                                                                                       0.25 
##                                               4506_IPR004506 tRNA-specific 2-thiouridylase 
##                                                                                       0.25 
##                           4528_IPR004528 3-deoxy-D-manno-octulosonate cytidylyltransferase 
##                                                                                       0.25 
##                                                        453_GO:0009058 biosynthetic process 
##                                                                                       0.25 
##                                                  4536_IPR004536 Selenophosphate synthetase 
##                                                                                       0.25 
##                                                       4569_GO:0009058 biosynthetic process 
##                                                                                       0.25 
##                                                       4607_GO:0009058 biosynthetic process 
##                                                                                       0.25 
##                                   4695_IPR004695 Transporter protein SLAC1/Mae1/ Ssu1/TehA 
##                                                                                       0.25 
##                                                      4769_IPR004769 Adenylosuccinate lyase 
##                                                                                       0.25 
##                                                                                    4792_NA 
##                                                                                       0.25 
##                                                       4835_GO:0016740 transferase activity 
##                                                                                       0.25 
##                                         4903_IPR004903 Lactobacillus surface layer protein 
##                                                                                       0.25 
##                                                4923_IPR004923 Iron permease FTR1/Fip1/EfeU 
##                                                                                       0.25 
##                                                       5128_GO:0009058 biosynthetic process 
##                                                                                       0.25 
##                                                                 5255_IPR005255 PdxA family 
##                                                                                       0.25 
##                                                                 529_GO:0006412 translation 
##                                                                                       0.25 
##                                      5650_IPR005650 BlaI transcriptional regulatory family 
##                                                                                       0.25 
##                               5670_IPR005670 Phosphate transport system permease protein 1 
##                                                                                       0.25 
##                                                                5704_GO:0006412 translation 
##                                                                                       0.25 
##                                                            5759_IPR005759 Endonuclease III 
##                                                                                       0.25 
##                                              577_GO:0005975 carbohydrate metabolic process 
##                                                                                       0.25 
##                                                                5813_GO:0006412 translation 
##                                                                                       0.25 
##                                                       5840_IPR005839 Methylthiotransferase 
##                                                                                       0.25 
##                                                       5930_GO:0009058 biosynthetic process 
##                                                                                       0.25 
## 5967_IPR005948 Thiamine/thiamin pyrophosphate-binding periplasmic protein, ABC transporter 
##                                                                                       0.25 
##                                                                 597_GO:0006412 translation 
##                                                                                       0.25 
##                                                       5982_IPR005982 Thioredoxin reductase 
##                                                                                       0.25 
##                                                                5996_GO:0006412 translation 
##                                                                                       0.25 
##                                                                6032_GO:0006412 translation 
##                                                                                       0.25 
##                                                          6035_GO:0046872 metal ion binding 
##                                                                                       0.25 
##                                                    6298_IPR006298 GTP-binding protein TypA 
##                                                                                       0.25 
##                                                                 630_GO:0006412 translation 
##                                                                                       0.25 
##                                                       639_IPR000639 Epoxide hydrolase-like 
##                                                                                       0.25 
##                                                                                     653_NA 
##                                                                                       0.25 
##                                         682_GO:0006807 nitrogen compound metabolic process 
##                                                                                       0.25 
##                                       7225_IPR007225 Exocyst complex component EXOC6/Sec15 
##                                                                                       0.25 
##                                        7269_GO:0006807 nitrogen compound metabolic process 
##                                                                                       0.25 
##                                   7305_IPR007305 Vesicle transport protein, Got1/SFT2-like 
##                                                                                       0.25 
##                                                       7315_GO:0009058 biosynthetic process 
##                                                                                       0.25 
##                                                                                    7375_NA 
##                                                                                       0.25 
##                                                       7466_GO:0009058 biosynthetic process 
##                                                                                       0.25 
##                                        748_IPR000748 Pseudouridine synthase, RsuA/RluB/E/F 
##                                                                                       0.25 
##                                                          7533_GO:0046872 metal ion binding 
##                                                                                       0.25 
##                                             7721_GO:0005975 carbohydrate metabolic process 
##                                                                                       0.25 
##                                                                                    7801_NA 
##                                                                                       0.25 
##                                       7812_IPR007812 Type II secretion system protein GspL 
##                                                                                       0.25 
##                                                         92_GO:0009058 biosynthetic process 
##                                                                                       0.25 
##                                                        926_GO:0009058 biosynthetic process 
##                                                                                       0.25 
##                                                93_IPR000093 DNA recombination protein RecR 
##                                                                                       0.25 
##                          9311_IPR009311 Interferon alpha-inducible protein IFI6/IFI27-like 
##                                                                                       0.25 
##                                         968_IPR000968 Influenza nuclear export protein NS2 
##                                                                                       0.25 
##                                                                                    9734_NA 
##                                                                                       0.25

Plotting MINT sPLS stability scores together with the heatmap, for GO terms:

# The row names of mint.spls2.WQ.GOs.mat.cor (created further down) are the
# order in which the indicator GO terms appear in the heatmap; they are reused
# as factor levels in the aes() of the stability barplot.

# Stability scores on dimension 1: number of times each feature was selected,
# divided by 4.
# NOTE(review): 4 is presumably the number of LOGOCV folds (sampling trips) -
# confirm against the code that builds list.selected.GOs.
MINT_sPLS_dim1_stability_GOs <- as.data.frame(table(list.selected.GOs) / 4) %>%
  separate(col = "list.selected.GOs", # labels look like "<ID>_<annotation>"
           sep = "_", # everything before the first "_" is the feature ID
           into = c("OTU", NA), # NA drops the annotation part, it is not needed
           extra = "merge", # annotations that contain "_" no longer warn
           fill = "right") # labels without "_" (e.g. "1") keep their ID, no warning

# Lookup table for GO features: the abundance table and the annotation table of
# megan_go_clr_5 are matched on their row names (stored as "OTU"), then
# Rank3-Rank6 are collapsed into a single "Function" label
GOs_biplot_colnames <- otu_table(megan_go_clr_5) %>%
  as.data.frame() %>%
  rownames_to_column("OTU") %>%
  left_join(tax_table(megan_go_clr_5) %>%
              as.data.frame() %>%
              rownames_to_column("OTU")) %>%
  unite(Function, c(Rank3, Rank4, Rank5, Rank6), sep = "; ")
## Joining, by = "OTU"
# Only the feature ID and its readable label are needed from the lookup table
GOs_biplot_colnames <- GOs_biplot_colnames %>%
  dplyr::select(OTU, Function)

# Attach the readable label to every stability score. The join key is spelled
# out so the merge cannot silently start matching on another shared column.
MINT_sPLS_dim1_stability_GOs <- left_join(MINT_sPLS_dim1_stability_GOs,
                                          GOs_biplot_colnames,
                                          by = "OTU")

# MINT sPLS is fitted once more, this time on GOs_biplot (per the original
# note, so that features are named by ID only, without the annotation text)
MINT_sPLS_similarity_scores_and_LOGOCV_GOs <- mint.spls(
  X = GOs_biplot,
  Y = sample_data(megan_go_clr_5)[, 24:40],
  study = sample_data(megan_go_clr_5)$Sampling_trip,
  ncomp = 2,
  mode = "regression",
  keepX = keepX # 50 taxa on dims 1 and 2
)

# The fitted model is replaced by the cim() result: the heatmap is drawn as a
# side effect, and the returned object is kept only for its matrix, which is
# later matched to the stability scores through its row names
MINT_sPLS_similarity_scores_and_LOGOCV_GOs <- cim(
  MINT_sPLS_similarity_scores_and_LOGOCV_GOs,
  comp = 1:2,
  title = "MINT sPLS GOs/WQ (PCs 1 and 2)",
  xlab = "WQ parameters",
  ylab = "Indicator microbial GO terms (genes/functions)",
  row.names = MINT_sPLS_GOs_ind_names_cim,
  margins = c(19, 50), # bottom, right
  keysize = c(1, 0.4),
  symkey = FALSE
)

# Matrix shown in the MINT sPLS heatmap
mint.spls2.WQ.GOs.mat.cor <- MINT_sPLS_similarity_scores_and_LOGOCV_GOs[["mat"]]
# Heatmap matrix (one row per feature, row names moved into "OTU") with the
# stability table joined onto it; every heatmap row is kept
MINT_sPLS_dim1_stability_merged_GOs <- mint.spls2.WQ.GOs.mat.cor %>%
  as.data.frame() %>%
  rownames_to_column("OTU") %>%
  left_join(MINT_sPLS_dim1_stability_GOs)
# Rows whose stability score (Freq) is NA stay in the table; the filter that
# dropped them is disabled:
#  filter(if_any(everything(), ~ !is.na(Freq)))
# Barplot of LOGOCV stability scores (dimension 1), one bar per heatmap row
MINT_sPLS_stability_plots_ordered.dim1_GOs <- MINT_sPLS_dim1_stability_merged_GOs %>%
  # Columns are picked by name rather than by position (1, 19, 20), so a change
  # in the number of WQ columns cannot shift the selection
  dplyr::select(OTU, Freq, Function) %>%
  # Bars follow the row order of the heatmap matrix
  ggplot(aes(y = factor(OTU, levels = unique(row.names(as.data.frame(mint.spls2.WQ.GOs.mat.cor)))),
             x = Freq,
             fill = Freq > 0.25)) + # highlights scores above 0.25 (Freq is count / 4)
  geom_col() +
  # guide = "none" replaces the deprecated guide = FALSE
  scale_fill_manual(values = c("FALSE" = "grey40", "TRUE" = "seagreen3"), guide = "none") +
#  scale_y_discrete(limits=rev) + # Reversing the order to match the heatmap
  labs(y = 'MINT sPLS Indicator GO terms',
       x = "LOGOCV Stability score - dim 1",
       title = 'MINT sPLS Are these signals shared across trips?',
       subtitle = 'Leave One Group Out Cross Validation (LOGOCV)') +
  # legend.position must be lower case; "NONE" is not a valid value
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12), legend.position = "none")

MINT_sPLS_stability_plots_ordered.dim1_GOs

# Build the table exported for RawGraphs (this can go in Supplementary
# Material): the data behind the stability barplot, joined to the annotation
# table of megan_go_clr_5, keeping only rows that have a stability score.
RawGraphs_shared_GOs_MINT <- MINT_sPLS_stability_plots_ordered.dim1_GOs$data %>%
  as.data.frame() %>%
  left_join(
    tax_table(megan_go_clr_5) %>%
      as.data.frame() %>%
      rownames_to_column("OTU")
  ) %>%
#  unite(full_taxonomy, c(Rank1, Rank2, Rank3, Rank4, Rank5, Rank6, Rank7), sep = "; ") %>%  # Adding Taxonomy info
  # Same rows as the former if_any(everything(), ~ !is.na(Freq)): the lambda
  # never used the column it was given, only Freq
  dplyr::filter(!is.na(Freq))
# Exporting as csv. The GO table built above is written here; the original
# call wrote the taxa table (RawGraphs_shared_taxa_MINT) into the GOs file.
# NOTE(review): quote = FALSE leaves fields unquoted, so any GO description
# containing a comma would shift columns in the csv -- confirm none do.
write.csv(RawGraphs_shared_GOs_MINT, file = "/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Code/output_tables/RawGraphs_shared_GOs_MINT.csv", quote = FALSE, row.names = FALSE)

Numerical values - only for stable indicators (this will help me with text writing):

# Similarity values from the taxa MINT sPLS heatmap, as a data frame keyed by
# an "OTU" column, with the taxonomy table of megan_genus_clr joined on
MINT_sPLS_mat.corr <- cim_mint.spls2.WQ.taxa.OTUs[["mat"]] %>%
  as.data.frame() %>%
  rownames_to_column("OTU") %>%
  left_join(
    megan_genus_clr@tax_table %>%
      as.data.frame() %>%
      rownames_to_column("OTU")
  )
# Append the stability scores (first two columns of the RawGraphs taxa table)
MINT_sPLS_mat.corr_and_LOGOCV <- MINT_sPLS_mat.corr %>%
  left_join(RawGraphs_shared_taxa_MINT[, 1:2])
# Render the merged table
knitr::kable(
  MINT_sPLS_mat.corr_and_LOGOCV,
  caption = "MINT sPLS - numerical representation of similarity scores (partial correlations)."
)
MINT sPLS - numerical representation of similarity scores (partial correlations).
OTU median_POC_µM median_PN_µM median_Chlorophyll_A_µg_L median_PP_µM SALINITY_2.5m_RV median_DOC_µM median_Phaeophytin_A_µg_L median_TDN_µM FLUORESCENCE_2.5m_RV median_Si_µM median_NO2_µM median_TSS_mg_L SEAWATER_TEMPERATURE_2.5m_RV median_NH4_µM median_NO3_µM median_TDP_µM median_PO4_µM Domain Phylum Class Order Family Genus Species Freq
135619_Unknown Family; Unknown Genus 0.4033428 0.3610483 0.3550518 0.2666964 0.2069694 0.3203748 0.0669948 -0.1305209 -0.0820813 -0.1154535 -0.3702580 -0.2279649 -0.1782364 -0.2482898 -0.5055078 -0.2462484 -0.4499928 NA NA NA NA NA NA NA NA
1890424_Unknown Family; Unknown Genus 0.4000287 0.3577860 0.3504673 0.2632587 0.2043906 0.3202456 0.0627213 -0.1338292 -0.0834813 -0.1151821 -0.3719716 -0.2296177 -0.1794068 -0.2488255 -0.5050225 -0.2423465 -0.4467868 NA NA NA NA NA NA NA NA
28221_Unknown Family; Unknown Genus 0.4056784 0.3629686 0.3561470 0.2675225 0.2076618 0.3236726 0.0652372 -0.1338014 -0.0837521 -0.1165123 -0.3751429 -0.2313170 -0.1807870 -0.2512120 -0.5105491 -0.2465917 -0.4528818 NA NA NA NA NA NA NA NA
31989_Rhodobacteraceae; Unknown Genus 0.4055754 0.3626928 0.3550215 0.2666807 0.2070638 0.3251446 0.0629091 -0.1364873 -0.0850189 -0.1169032 -0.3780005 -0.2334474 -0.1823771 -0.2527475 -0.5126970 -0.2453627 -0.4530719 NA NA NA NA NA NA NA NA
119060_Burkholderiaceae; Unknown Genus 0.4057722 0.3627214 0.3543630 0.2661891 0.2067266 0.3265495 0.0610846 -0.1387361 -0.0860937 -0.1172973 -0.3805535 -0.2353174 -0.1837784 -0.2541534 -0.5147733 -0.2445457 -0.4535365 NA NA NA NA NA NA NA NA
186802_Unknown Family; Unknown Genus 0.4216702 0.3778952 0.3736732 0.2806748 0.2176846 0.3311964 0.0755955 -0.1299136 -0.0827148 -0.1196889 -0.3799840 -0.2330611 -0.1824027 -0.2557276 -0.5230028 -0.2602413 -0.4697065 NA NA NA NA NA NA NA NA
561_Enterobacteriaceae; Escherichia 0.4167212 0.3727744 0.3654223 0.2744912 0.2130936 0.3331127 0.0660767 -0.1385452 -0.0865536 -0.1198542 -0.3865506 -0.2385003 -0.1863708 -0.2586979 -0.5253690 -0.2528316 -0.4653330 NA NA NA NA NA NA NA NA
1150_Unknown Family; Unknown Genus 0.4134463 0.3710803 0.3695158 0.2775409 0.2150881 0.3200371 0.0811115 -0.1191547 -0.0772065 -0.1160831 -0.3636434 -0.2218955 -0.1738979 -0.2459040 -0.5059153 -0.2586931 -0.4596232 NA NA NA NA NA NA NA NA
976_Unknown Family; Unknown Genus 0.4066624 0.3646181 0.3613471 0.2714129 0.2104497 0.3179472 0.0750787 -0.1227321 -0.0785596 -0.1150336 -0.3636830 -0.2227075 -0.1743723 -0.2451221 -0.5022467 -0.2520759 -0.4527022 NA NA NA NA NA NA NA NA
1161_Unknown Family; Unknown Genus 0.3964776 0.3552035 0.3507030 0.2634237 0.2043393 0.3123777 0.0696384 -0.1238471 -0.0785758 -0.1128002 -0.3591222 -0.2205013 -0.1725247 -0.2414459 -0.4931755 -0.2439663 -0.4418341 NA NA NA NA NA NA NA NA
1236_Unknown Family; Unknown Genus 0.3880844 0.3460719 0.3341892 0.2510522 0.1952253 0.3194122 0.0478661 -0.1451089 -0.0882226 -0.1141043 -0.3774493 -0.2350566 -0.1832382 -0.2503776 -0.5027344 -0.2285596 -0.4351598 NA NA NA NA NA NA NA NA
1263978_Rhodospirillaceae; Candidatus Endolissoclinum 0.3852173 0.3430686 0.3292031 0.2473157 0.1924570 0.3208316 0.0418915 -0.1506509 -0.0907029 -0.1142837 -0.3818412 -0.2386434 -0.1858627 -0.2524166 -0.5045585 -0.2240347 -0.4326868 NA NA NA NA NA NA NA NA
91347_Unknown Family; Unknown Genus 0.3918159 0.3490571 0.3354727 0.2520234 0.1960858 0.3253809 0.0440168 -0.1515751 -0.0914722 -0.1159853 -0.3865838 -0.2413982 -0.1880501 -0.2557666 -0.5118144 -0.2285826 -0.4399128 NA NA NA NA NA NA NA NA
82115_Rhizobiaceae; Unknown Genus 0.3748928 0.3339565 0.3208460 0.2410357 0.1875443 0.3115329 0.0418096 -0.1453884 -0.0876919 -0.1110313 -0.3702775 -0.2312616 -0.1801445 -0.2449313 -0.4900099 -0.2185554 -0.4209526 NA NA NA NA NA NA NA NA
356_Unknown Family; Unknown Genus 0.4217166 0.3767941 0.3672718 0.2758896 0.2143148 0.3409058 0.0612169 -0.1468562 -0.0907404 -0.1223189 -0.3984042 -0.2467119 -0.1926050 -0.2657092 -0.5372354 -0.2530105 -0.4716571 NA NA NA NA NA NA NA NA
1224_Unknown Family; Unknown Genus 0.4138764 0.3695250 0.3589550 0.2696476 0.2095462 0.3368031 0.0567544 -0.1480376 -0.0909058 -0.1206495 -0.3952441 -0.2452736 -0.1913770 -0.2630693 -0.5305230 -0.2466293 -0.4633272 NA NA NA NA NA NA NA NA
570_Enterobacteriaceae; Klebsiella 0.4129173 0.3681086 0.3549652 0.2666619 0.2073970 0.3407642 0.0495706 -0.1559926 -0.0946247 -0.1216527 -0.4033370 -0.2513840 -0.1959247 -0.2673386 -0.5362420 -0.2424993 -0.4631843 NA NA NA NA NA NA NA NA
28216_Unknown Family; Unknown Genus 0.4047091 0.3611206 0.3497665 0.2627496 0.2042527 0.3312014 0.0527332 -0.1480110 -0.0904325 -0.1184799 -0.3900205 -0.2424585 -0.1890943 -0.2591546 -0.5214954 -0.2397718 -0.4534294 NA NA NA NA NA NA NA NA
41295_Rhodospirillaceae; Unknown Genus 0.3904162 0.3470986 0.3302621 0.2481238 0.1932718 0.3302424 0.0349000 -0.1615758 -0.0961378 -0.1172006 -0.3966480 -0.2490209 -0.1937194 -0.2610514 -0.5188136 -0.2232452 -0.4395238 NA NA NA NA NA NA NA NA
2_Unknown Family; Unknown Genus 0.4454294 0.4000795 0.3997550 0.3002475 0.2325983 0.3423102 0.0910808 -0.1240248 -0.0811203 -0.1243910 -0.3870541 -0.2355617 -0.1847353 -0.2623703 -0.5414112 -0.2805692 -0.4946908 NA NA NA NA NA NA NA NA
543_Enterobacteriaceae; Unknown Genus 0.4331295 0.3884009 0.3851590 0.2892973 0.2243015 0.3381989 0.0806217 -0.1299473 -0.0833067 -0.1224010 -0.3865141 -0.2365803 -0.1852563 -0.2606213 -0.5342879 -0.2688132 -0.4820790 NA NA NA NA NA NA NA NA
135622_Unknown Family; Unknown Genus 0.4317076 0.3916543 0.4094259 0.3074336 0.2370146 0.2987556 0.1373711 -0.0624354 -0.0512646 -0.1116289 -0.3124135 -0.1818086 -0.1442970 -0.2203214 -0.4763597 -0.2966994 -0.4729716 NA NA NA NA NA NA NA NA
204455_Unknown Family; Unknown Genus 0.4220560 0.3824063 0.3974997 0.2984876 0.2302551 0.2962394 0.1281081 -0.0683251 -0.0535686 -0.1102595 -0.3133386 -0.1836079 -0.1454532 -0.2196794 -0.4718104 -0.2869418 -0.4632147 NA NA NA NA NA NA NA NA
12916_Comamonadaceae; Acidovorax 0.4241158 0.3852801 0.4051201 0.3041908 0.2343712 0.2891565 0.1414183 -0.0537328 -0.0467618 -0.1084903 -0.2986635 -0.1724907 -0.1371854 -0.2119756 -0.4616149 -0.2947430 -0.4638011 NA NA NA NA NA NA NA NA
1501348_Amoebophilaceae; Unknown Genus 0.4204972 0.3817519 0.4003049 0.3005797 0.2316558 0.2887293 0.1371776 -0.0568443 -0.0480535 -0.1081165 -0.2999911 -0.1738924 -0.1381622 -0.2122661 -0.4606657 -0.2906972 -0.4602444 NA NA NA NA NA NA NA NA
265488_Pirellulaceae; Rhodopirellula 0.3806425 0.3435812 0.3511540 0.2637107 0.2037942 0.2781944 0.0991429 -0.0809117 -0.0574476 -0.1024225 -0.3035360 -0.1811183 -0.1427841 -0.2094662 -0.4416681 -0.2505135 -0.4199264 NA NA NA NA NA NA NA NA
367771_Roseobacteraceae; Marinovum 0.4086796 0.3676470 0.3700198 0.2779026 0.2151181 0.3091943 0.0908154 -0.1052626 -0.0703882 -0.1128097 -0.3458602 -0.2092615 -0.1643633 -0.2357085 -0.4896002 -0.2610789 -0.4529200 NA NA NA NA NA NA NA NA
1706369_Unknown Family; Unknown Genus 0.4024044 0.3620001 0.3643281 0.2736279 0.2118096 0.3044617 0.0893984 -0.1036727 -0.0693200 -0.1110816 -0.3405783 -0.2060697 -0.1618555 -0.2321047 -0.4821046 -0.2570587 -0.4459684 NA NA NA NA NA NA NA NA
1117_Unknown Family; Unknown Genus 0.3939522 0.3541367 0.3552108 0.2667855 0.2065891 0.3002660 0.0842496 -0.1053441 -0.0696866 -0.1093434 -0.3376033 -0.2048392 -0.1607709 -0.2294926 -0.4752012 -0.2500088 -0.4370329 NA NA NA NA NA NA NA NA
80865_Comamonadaceae; Delftia 0.4115579 0.3715627 0.3801038 0.2854500 0.2205724 0.3001444 0.1081540 -0.0863551 -0.0615791 -0.1105668 -0.3269641 -0.1949206 -0.1537023 -0.2258156 -0.4765953 -0.2713438 -0.4539058 NA NA NA NA NA NA NA NA
265976_Ornithinimicrobiaceae; Serinicoccus 0.4091966 0.3693561 0.3775013 0.2834971 0.2190847 0.2990554 0.1065919 -0.0869676 -0.0617505 -0.1101037 -0.3262911 -0.1946940 -0.1534869 -0.2251714 -0.4747886 -0.2693119 -0.4514259 NA NA NA NA NA NA NA NA
1406885_Alteromonadaceae; Aliiglaciecola 0.3582115 0.3215356 0.3203201 0.2405897 0.1864428 0.2770259 0.0706554 -0.1027889 -0.0666803 -0.1005057 -0.3145765 -0.1918912 -0.1503968 -0.2127892 -0.4379525 -0.2243245 -0.3981692 NA NA NA NA NA NA NA NA
165697_Sphingomonadaceae; Sphingopyxis 0.3589377 0.3219899 0.3198557 0.2402449 0.1862341 0.2792596 0.0683115 -0.1059237 -0.0682013 -0.1011619 -0.3183913 -0.1946356 -0.1524619 -0.2149412 -0.4412908 -0.2235243 -0.3993046 NA NA NA NA NA NA NA NA
167375_Prochlorococcaceae; Cyanobium 0.3434407 0.3071863 0.3009616 0.2260715 0.1755151 0.2748366 0.0540081 -0.1147102 -0.0715833 -0.0988594 -0.3191491 -0.1969851 -0.1539152 -0.2135164 -0.4334251 -0.2081446 -0.3835633 NA NA NA NA NA NA NA NA
183963_Unknown Family; Unknown Genus 0.3341531 0.2987922 0.2923328 0.2195917 0.1705106 0.2681398 0.0514536 -0.1128954 -0.0702571 -0.0963850 -0.3119159 -0.1926942 -0.1505272 -0.2084992 -0.4227819 -0.2019637 -0.3733351 NA NA NA NA NA NA NA NA
1890426_Synechococcaceae; Unknown Genus 0.3721298 0.3338817 0.3319364 0.2493176 0.1932500 0.2890355 0.0715478 -0.1089630 -0.0703036 -0.1047479 -0.3291664 -0.2011019 -0.1575519 -0.2223391 -0.4567947 -0.2321056 -0.4138846 NA NA NA NA NA NA NA NA
1680826_Candidatus Thalassarchaeaceae; Candidatus Thalassarchaeum 0.3640554 0.3255715 0.3187260 0.2394166 0.1858919 0.2917846 0.0565787 -0.1223854 -0.0762540 -0.1049155 -0.3391631 -0.2094446 -0.1636288 -0.2267969 -0.4601023 -0.2202996 -0.4066750 NA NA NA NA NA NA NA NA
213422_Geobacteraceae; Unknown Genus 0.3056509 0.2739048 0.2707727 0.2033842 0.1577445 0.2402011 0.0546014 -0.0943979 -0.0600649 -0.0867928 -0.2756829 -0.1691204 -0.1323537 -0.1855008 -0.3792944 -0.1885397 -0.3404961 NA NA NA NA NA NA NA NA
213421_Desulfuromonadaceae; Unknown Genus 0.3034515 0.2719144 0.2687149 0.2018389 0.1565517 0.2386370 0.0539641 -0.0940061 -0.0597688 -0.0862127 -0.2740113 -0.1681349 -0.1315743 -0.1843350 -0.3768058 -0.1870598 -0.3380782 NA NA NA NA NA NA NA NA
204441_Unknown Family; Unknown Genus 0.2685214 0.2344867 0.2032343 0.1527781 0.1203314 0.2630399 -0.0293990 -0.1739650 -0.0958783 -0.0903219 -0.3410260 -0.2218470 -0.1710320 -0.2164924 -0.4094469 -0.1265977 -0.3093450 NA NA NA NA NA NA NA NA
85006_Unknown Family; Unknown Genus 0.2562314 0.2238913 0.1947040 0.1463623 0.1152303 0.2498423 -0.0263305 -0.1639755 -0.0905300 -0.0858746 -0.3232165 -0.2100616 -0.1619846 -0.2053917 -0.3890093 -0.1216728 -0.2949590 NA NA NA NA NA NA NA NA
1655514_Pelagibacteraceae; Unknown Genus 0.2281492 0.1985453 0.1688089 0.1269156 0.1002014 0.2293008 -0.0336188 -0.1579755 -0.0862772 -0.0783135 -0.3007896 -0.1966748 -0.1514322 -0.1899201 -0.3563992 -0.1032041 -0.2639752 NA NA NA NA NA NA NA NA
1706372_Halieaceae; Unknown Genus 0.3165124 0.2781703 0.2495666 0.1875658 0.1471102 0.2950226 -0.0123008 -0.1787556 -0.1005589 -0.1023988 -0.3734213 -0.2403274 -0.1857796 -0.2397203 -0.4606013 -0.1605025 -0.3616819 NA NA NA NA NA NA NA NA
1239_Unknown Family; Unknown Genus 0.3168624 0.2779682 0.2469691 0.1856252 0.1457592 0.2996631 -0.0187311 -0.1865035 -0.1042456 -0.1036792 -0.3820311 -0.2466700 -0.1905264 -0.2444247 -0.4674328 -0.1574421 -0.3629287 NA NA NA NA NA NA NA NA
1341118_Halieaceae; Luminiphilus 0.4382632 0.3996907 0.4274216 0.3209071 0.2468184 0.2856079 0.1657596 -0.0324347 -0.0373871 -0.1085399 -0.2835578 -0.1596597 -0.1278729 -0.2054700 -0.4576783 -0.3144768 -0.4766823 NA NA NA NA NA NA NA NA
574899_Verrucomicrobiaceae; Haloferula 0.4300113 0.3917026 0.4167657 0.3129152 0.2407973 0.2841463 0.1568143 -0.0386771 -0.0399285 -0.1075556 -0.2856590 -0.1621694 -0.1295873 -0.2056308 -0.4547994 -0.3056169 -0.4684758 NA NA NA NA NA NA NA NA
1803399_Unknown Family; Candidatus Peribacter 0.4032001 0.3686355 0.3984233 0.2991183 0.2298097 0.2549547 0.1641042 -0.0161835 -0.0279289 -0.0977452 -0.2460459 -0.1358947 -0.1094284 -0.1810020 -0.4096268 -0.2951736 -0.4370138 NA NA NA NA NA NA NA NA
455358_Balneolaceae; Balneola 0.4922776 0.4517483 0.4958695 0.3722464 0.2855451 0.2971315 0.2214039 0.0050034 -0.0223728 -0.1155118 -0.2735204 -0.1459869 -0.1187097 -0.2064305 -0.4793893 -0.3710042 -0.5307840 NA NA NA NA NA NA NA NA
1931200_Rhodobacteraceae; Marinibacterium 0.4423731 0.4098223 0.4674196 0.3508208 0.2680919 0.2342509 0.2476827 0.0618267 0.0070444 -0.0949395 -0.1835511 -0.0850440 -0.0721918 -0.1517950 -0.3827854 -0.3579791 -0.4705452 NA NA NA NA NA NA NA NA
1813606_Balneolaceae; Unknown Genus 0.3422029 0.3157030 0.3541367 0.2658188 0.2034657 0.1923799 0.1749808 0.0282743 -0.0038099 -0.0764640 -0.1632154 -0.0815239 -0.0676054 -0.1289191 -0.3124805 -0.2685341 -0.3661889 NA NA NA NA NA NA NA NA
2146_Acholeplasmataceae; Unknown Genus 0.5356608 0.5002872 0.5887749 0.4418354 0.3366302 0.2494391 0.3507968 0.1347356 0.0368822 -0.1057053 -0.1572591 -0.0547897 -0.0514039 -0.1486032 -0.4133741 -0.4591447 -0.5630583 NA NA NA NA NA NA NA NA
544448_Unknown Family; Unknown Genus 0.5329406 0.4976408 0.5851881 0.4391455 0.3346064 0.2490685 0.3476826 0.1324831 0.0359522 -0.1054109 -0.1581631 -0.0557737 -0.0520862 -0.1487706 -0.4125881 -0.4561406 -0.5603749 NA NA NA NA NA NA NA NA
31969_Unknown Family; Unknown Genus 0.5406497 0.5059225 0.5997610 0.4500639 0.3426641 0.2435006 0.3663517 0.1504488 0.0440725 -0.1044547 -0.1430269 -0.0436629 -0.0431861 -0.1414860 -0.4051173 -0.4696212 -0.5666805 NA NA NA NA NA NA NA NA
662_Vibrionaceae; Vibrio 0.1993392 0.1707143 0.1319348 0.0992590 0.0793528 0.2237036 -0.0641149 -0.1789052 -0.0947406 -0.0747434 -0.3071867 -0.2047410 -0.1568977 -0.1899729 -0.3456238 -0.0726416 -0.2352265 NA NA NA NA NA NA NA NA
335929_Erythrobacteraceae; Unknown Genus 0.1810788 0.1538317 0.1128332 0.0849218 0.0683880 0.2137447 -0.0739085 -0.1809509 -0.0947916 -0.0707462 -0.2990603 -0.2008230 -0.1536131 -0.1834093 -0.3293991 -0.0580820 -0.2157464 NA NA NA NA NA NA NA NA
69277_Phyllobacteriaceae; Unknown Genus 0.1779742 0.1496224 0.1020364 0.0768408 0.0625469 0.2233862 -0.0924319 -0.2011350 -0.1041938 -0.0731330 -0.3192141 -0.2161225 -0.1649860 -0.1939566 -0.3432506 -0.0471001 -0.2146595 NA NA NA NA NA NA NA NA
2742_Marinobacteraceae; Marinobacter 0.1331937 0.1081237 0.0546461 0.0412723 0.0353694 0.1997851 -0.1176703 -0.2075889 -0.1049994 -0.0635527 -0.3008457 -0.2076709 -0.1577957 -0.1787056 -0.3046656 -0.0107789 -0.1670491 NA NA NA NA NA NA NA NA
1213_Prochloraceae; Unknown Genus 0.1529571 0.1228809 0.0555024 0.0419750 0.0367972 0.2403176 -0.1513249 -0.2574464 -0.1296031 -0.0759283 -0.3661729 -0.2538223 -0.1926711 -0.2164260 -0.3658281 -0.0042066 -0.1939733 NA NA NA NA NA NA NA NA
72275_Alteromonadaceae; Unknown Genus 0.0762656 0.0557564 -0.0034086 -0.0023067 0.0019737 0.1664918 -0.1448620 -0.2100361 -0.1032970 -0.0504835 -0.2712438 -0.1922926 -0.1451918 -0.1559321 -0.2507926 0.0329262 -0.1058776 NA NA NA NA NA NA NA NA
72274_Unknown Family; Unknown Genus 0.0438098 0.0246905 -0.0433304 -0.0322531 -0.0206605 0.1577561 -0.1756029 -0.2293615 -0.1108174 -0.0458042 -0.2738332 -0.1979566 -0.1487908 -0.1534910 -0.2350925 0.0655324 -0.0730138 NA NA NA NA NA NA NA NA
1218_Prochlorococcaceae; Prochlorococcus 0.0600530 0.0373776 -0.0394784 -0.0293222 -0.0178286 0.1863425 -0.1962329 -0.2620665 -0.1271215 -0.0546968 -0.3185440 -0.2292298 -0.1724789 -0.1796291 -0.2784344 0.0673867 -0.0942146 NA NA NA NA NA NA NA NA
1839_Nocardioidaceae; Nocardioides 0.0940781 0.0785494 0.0508821 0.0383348 0.0314534 0.1226697 -0.0556817 -0.1143478 -0.0588785 -0.0398993 -0.1774526 -0.1207040 -0.0920405 -0.1072460 -0.1881654 -0.0214552 -0.1143705 NA NA NA NA NA NA NA NA
135623_Unknown Family; Unknown Genus 0.0770208 0.0592405 0.0130884 0.0100280 0.0107013 0.1433211 -0.1093818 -0.1686805 -0.0837508 -0.0442690 -0.2267736 -0.1592368 -0.1205035 -0.1319374 -0.2169051 0.0146255 -0.1020539 NA NA NA NA NA NA NA NA
53246_Pseudoalteromonadaceae; Pseudoalteromonas 0.0420457 0.0285680 -0.0141183 -0.0104211 -0.0053593 0.1101638 -0.1071944 -0.1479532 -0.0721774 -0.0328031 -0.1844520 -0.1318959 -0.0993884 -0.1048749 -0.1651920 0.0319434 -0.0619782 NA NA NA NA NA NA NA NA
49_Polyangiaceae; Unknown Genus 0.0349249 0.0220151 -0.0213946 -0.0158831 -0.0095443 0.1060214 -0.1106284 -0.1482978 -0.0719827 -0.0311743 -0.1807911 -0.1300033 -0.0978350 -0.1020490 -0.1584855 0.0374267 -0.0543309 NA NA NA NA NA NA NA NA
390876_Thalassobaculaceae; Nisaea -0.0029652 -0.0147777 -0.0709625 -0.0530576 -0.0375284 0.1002695 -0.1531299 -0.1786407 -0.0844473 -0.0269144 -0.1922624 -0.1428790 -0.1067174 -0.1037745 -0.1466727 0.0788295 -0.0168374 NA NA NA NA NA NA NA NA
2745_Halomonadaceae; Halomonas 0.0020200 -0.0124173 -0.0784259 -0.0586211 -0.0412136 0.1220236 -0.1787679 -0.2113951 -0.1002090 -0.0331553 -0.2306474 -0.1707612 -0.1276514 -0.1251535 -0.1789969 0.0891404 -0.0258922 NA NA NA NA NA NA NA NA
1041_Erythrobacteraceae; Erythrobacter 0.0199626 0.0105310 -0.0238011 -0.0177294 -0.0115514 0.0779751 -0.0890757 -0.1151719 -0.0555437 -0.0225192 -0.1363493 -0.0987818 -0.0742106 -0.0762082 -0.1160496 0.0344322 -0.0344656 NA NA NA NA NA NA NA NA
1313115_Thalassospiraceae; Magnetovibrio -0.0720720 -0.0766999 -0.1321445 -0.0990130 -0.0731734 0.0459022 -0.1653881 -0.1571952 -0.0708183 -0.0072750 -0.1298201 -0.1045589 -0.0767310 -0.0617731 -0.0608296 0.1214139 0.0601593 NA NA NA NA NA NA NA NA
766_Unknown Family; Unknown Genus -0.0828928 -0.0862681 -0.1410048 -0.1056708 -0.0783757 0.0363089 -0.1657004 -0.1519463 -0.0677889 -0.0039075 -0.1179899 -0.0970368 -0.0708983 -0.0540847 -0.0458048 0.1272709 0.0724277 NA NA NA NA NA NA NA NA
1868329_Flavobacteriaceae; Xanthomarina 0.2905098 0.2814517 0.3764087 0.2823047 0.2126434 0.0495605 0.3177452 0.2230882 0.0910436 -0.0341382 0.0775781 0.0910287 0.0623546 0.0076118 -0.0985725 -0.3133444 -0.2885414 NA NA NA NA NA NA NA NA
261827_Flavobacteriaceae; Algibacter 0.2895329 0.2804477 0.3748187 0.2811130 0.2117575 0.0498806 0.3159528 0.2214862 0.0903340 -0.0341550 0.0763925 0.0900370 0.0616326 0.0070854 -0.0989543 -0.3119254 -0.2876667 NA NA NA NA NA NA NA NA
561367_Flavobacteriaceae; Salinimicrobium 0.2796548 0.2707874 0.3615115 0.2711339 0.2042593 0.0489586 0.3040134 0.2125649 0.0866057 -0.0332007 0.0723044 0.0858666 0.0587089 0.0060411 -0.0967211 -0.3006981 -0.2780053 NA NA NA NA NA NA NA NA
153265_Flavobacteriaceae; Aequorivita 0.2701101 0.2624642 0.3543531 0.2657524 0.2000167 0.0395103 0.3052049 0.2189209 0.0900954 -0.0299636 0.0846134 0.0938910 0.0648920 0.0138378 -0.0820229 -0.2962720 -0.2669902 NA NA NA NA NA NA NA NA
574559_Robiginitomaculaceae; Hellea 0.2988090 0.2926727 0.4050943 0.3037749 0.2281646 0.0240523 0.3668675 0.2765801 0.1159579 -0.0278296 0.1309492 0.1315537 0.0924774 0.0355338 -0.0619334 -0.3425022 -0.2914991 NA NA NA NA NA NA NA NA
225842_Flavobacteriaceae; Formosa 0.3645983 0.3511518 0.4606864 0.3455412 0.2607011 0.0797928 0.3726130 0.2491934 0.0996821 -0.0476038 0.0639367 0.0894626 0.0597377 -0.0085500 -0.1494926 -0.3800531 -0.3655815 NA NA NA NA NA NA NA NA
1608457_Rhodobacteraceae; Aestuariivita 0.3699578 0.3531503 0.4496229 0.3372876 0.2551379 0.1077441 0.3382620 0.2059926 0.0789547 -0.0555480 0.0139983 0.0530583 0.0324277 -0.0362303 -0.1909318 -0.3655428 -0.3762121 NA NA NA NA NA NA NA NA
117747_Unknown Family; Unknown Genus 0.2375288 0.2252696 0.2804004 0.2103658 0.1594494 0.0816032 0.1986959 0.1105083 0.0403935 -0.0390260 -0.0146231 0.0165616 0.0077389 -0.0360485 -0.1407970 -0.2253680 -0.2439838 NA NA NA NA NA NA NA NA
1123967_Porticoccaceae; Porticoccus 0.1820252 0.1707939 0.2045237 0.1534683 0.1167358 0.0780822 0.1291423 0.0574769 0.0180698 -0.0341128 -0.0407456 -0.0092079 -0.0104353 -0.0436230 -0.1306802 -0.1610378 -0.1900237 NA NA NA NA NA NA NA NA
489140_Geminicoccaceae; Geminicoccus 0.0641967 0.0691734 0.1225230 0.0917956 0.0677158 -0.0481202 0.1580750 0.1526779 0.0690750 0.0084370 0.1293784 0.1033229 0.0759611 0.0624666 0.0647831 -0.1135758 -0.0521658 NA NA NA NA NA NA NA NA
1649495_Flavobacteriaceae; Seonamhaeicola 0.0378089 0.0451591 0.0970754 0.0726886 0.0530063 -0.0657482 0.1487365 0.1553861 0.0716837 0.0150889 0.1472714 0.1135439 0.0841146 0.0752820 0.0929723 -0.0949651 -0.0233800 NA NA NA NA NA NA NA NA
1738655_Woeseiaceae; Woeseia 0.0157185 0.0224251 0.0609415 0.0456067 0.0328798 -0.0582387 0.1078008 0.1186850 0.0554140 0.0146337 0.1199443 0.0907360 0.0675013 0.0630981 0.0839406 -0.0626742 -0.0036532 NA NA NA NA NA NA NA NA
1284657_Rhodobacteraceae; Planktomarina 0.0152383 0.0244924 0.0745977 0.0558139 0.0400499 -0.0797582 0.1391606 0.1558335 0.0730300 0.0204897 0.1605470 0.1207821 0.0899645 0.0851447 0.1155188 -0.0782451 0.0010320 NA NA NA NA NA NA NA NA
358023_Flavobacteriaceae; Lutibacter 0.0038996 0.0136192 0.0605374 0.0452672 0.0320828 -0.0826401 0.1281681 0.1487844 0.0702618 0.0220784 0.1593192 0.1185638 0.0885281 0.0858225 0.1207546 -0.0667262 0.0124799 NA NA NA NA NA NA NA NA
1579505_Pirellulaceae; Rubripirellula 0.1082494 0.1075437 0.1553076 0.1164432 0.0871634 -0.0041300 0.1520074 0.1226735 0.0526521 -0.0066073 0.0718411 0.0657489 0.0470214 0.0260886 -0.0036155 -0.1337171 -0.1030801 NA NA NA NA NA NA NA NA
299261_Roseobacteraceae; Tateyamaria -0.0738414 -0.0591069 -0.0255825 -0.0193580 -0.0171259 -0.1178348 0.0757594 0.1274685 0.0640748 0.0371472 0.1802298 0.1250976 0.0949288 0.1063536 0.1792728 0.0006654 0.0939995 NA NA NA NA NA NA NA NA
1759396_Rhodobacteraceae; Marivivens -0.0400963 -0.0286964 0.0052722 0.0038144 0.0007956 -0.0927575 0.0839321 0.1195700 0.0586383 0.0279550 0.1525329 0.1084570 0.0818342 0.0873573 0.1395102 -0.0212323 0.0566904 NA NA NA NA NA NA NA NA
2433_Roseobacteraceae; Roseobacter -0.0606787 -0.0485497 -0.0209034 -0.0158184 -0.0140104 -0.0970085 0.0625203 0.1050588 0.0528010 0.0305738 0.1484418 0.1030496 0.0781950 0.0875791 0.1475779 0.0004128 0.0772785 NA NA NA NA NA NA NA NA
1217416_Halieaceae; Halioglobus -0.0642773 -0.0480723 -0.0032180 -0.0026090 -0.0048719 -0.1311760 0.1084897 0.1610164 0.0794809 0.0400740 0.2112319 0.1491848 0.1127427 0.1220111 0.1979693 -0.0208875 0.0874394 NA NA NA NA NA NA NA NA
75787_Rhodocyclaceae; Unknown Genus -0.1069751 -0.0864884 -0.0419069 -0.0316661 -0.0273628 -0.1634345 0.0989342 0.1719347 0.0867973 0.0518478 0.2472803 0.1709841 0.1298673 0.1465907 0.2490550 0.0064234 0.1347504 NA NA NA NA NA NA NA NA
475794_Halieaceae; Haliea -0.1178646 -0.0992655 -0.0685726 -0.0516347 -0.0419481 -0.1464399 0.0589840 0.1305795 0.0677607 0.0480272 0.2085534 0.1410170 0.1076851 0.1269065 0.2251232 0.0323174 0.1418653 NA NA NA NA NA NA NA NA
518755_Verrucomicrobiaceae; Roseibacillus -0.1836952 -0.1803605 -0.2515023 -0.1885926 -0.1415658 -0.0110818 -0.2310446 -0.1765131 -0.0743562 0.0161063 -0.0875407 -0.0860918 -0.0607508 -0.0256566 0.0326453 0.2133362 0.1784742 NA NA NA NA NA NA NA NA
1892252_Microcoleaceae; Unknown Genus -0.1275075 -0.1239053 -0.1673150 -0.1254803 -0.0944404 -0.0185912 -0.1441635 -0.1034482 -0.0425799 0.0141283 -0.0400564 -0.0444064 -0.0306959 -0.0065939 0.0386316 0.1399025 0.1260230 NA NA NA NA NA NA NA NA
267893_Idiomarinaceae; Unknown Genus -0.1899227 -0.1796115 -0.2213320 -0.1660584 -0.1259804 -0.0695568 -0.1524642 -0.0808193 -0.0287269 0.0323700 0.0198788 -0.0071731 -0.0016523 0.0332572 0.1188923 0.1769654 0.1959297 NA NA NA NA NA NA NA NA
119045_Methylobacteriaceae; Unknown Genus -0.2961772 -0.2680004 -0.2769560 -0.2079765 -0.1605335 -0.2108717 -0.0854583 0.0531729 0.0400664 0.0781823 0.2255584 0.1330528 0.1052149 0.1572324 0.3354683 0.1991200 0.3256465 NA NA NA NA NA NA NA NA
150830_Stappiaceae; Roseibium -0.3319441 -0.2997781 -0.3070950 -0.2306201 -0.1781779 -0.2413018 -0.0883941 0.0682830 0.0490196 0.0889669 0.2622304 0.1561138 0.1231471 0.1813289 0.3832556 0.2194400 0.3659467 NA NA NA NA NA NA NA NA
478070_Stappiaceae; Labrenzia -0.3631634 -0.3298811 -0.3467399 -0.2603556 -0.2006051 -0.2478369 -0.1207417 0.0464252 0.0402378 0.0929626 0.2561909 0.1480347 0.1177190 0.1817551 0.3956205 0.2522057 0.3971918 NA NA NA NA NA NA NA NA
# Save the merged taxa similarity/stability table as a csv, without quoting
# and without a row-name column (FALSE spelled out: F can be reassigned).
write.csv(
  MINT_sPLS_mat.corr_and_LOGOCV,
  file = "/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Code/output_tables/MINT_sPLS_sim_and LOGOCV_stab_scores_taxa.csv",
  quote = FALSE,
  row.names = FALSE
)
# Similarity values from the GO MINT sPLS heatmap, as a data frame keyed by an
# "OTU" column, with the annotation table of megan_go_clr_5 joined on
MINT_sPLS_mat.corr.GOs <- cim_mint.spls2.WQ.GOs[["mat"]] %>%
  as.data.frame() %>%
  rownames_to_column("OTU") %>%
  left_join(
    megan_go_clr_5@tax_table %>%
      as.data.frame() %>%
      rownames_to_column("OTU")
  )
# Append the stability scores (first two columns of the RawGraphs GO table)
MINT_sPLS_mat.corr_and_LOGOCV_GOs <- MINT_sPLS_mat.corr.GOs %>%
  left_join(RawGraphs_shared_GOs_MINT[, 1:2])
# Render the merged table
knitr::kable(
  MINT_sPLS_mat.corr_and_LOGOCV_GOs,
  caption = "MINT sPLS - numerical representation of similarity scores (partial correlations)."
)
MINT sPLS - numerical representation of similarity scores (partial correlations).
OTU median_PN_µM median_Chlorophyll_A_µg_L median_POC_µM SALINITY_2.5m_RV median_PP_µM median_Phaeophytin_A_µg_L median_TSS_mg_L median_TDN_µM median_DOC_µM SEAWATER_TEMPERATURE_2.5m_RV FLUORESCENCE_2.5m_RV median_Si_µM median_NH4_µM median_NO2_µM median_NO3_µM median_PO4_µM median_TDP_µM Rank1 Rank2 Rank3 Rank4 Rank5 Rank6 Freq
4131_IPR004131 Pyrophosphate-energised proton pump 0.1679383 0.1696877 0.1338804 0.3330083 0.2635126 0.2460769 0.1902344 0.0070085 -0.2092062 0.0599995 0.0114677 -0.0781077 0.2051473 0.2799049 0.1734274 0.0265416 -0.1786604 NA NA NA NA NA NA NA
11400_IPR011400 Eukaryotic translation initiation factor 3 subunit B 0.1662056 0.1679843 0.1318197 0.3311321 0.2620205 0.2452760 0.1901574 0.0059766 -0.2105404 0.0599004 0.0114952 -0.0781252 0.2058323 0.2810541 0.1746662 0.0280886 -0.1772776 NA NA NA NA NA NA NA
7276_IPR007276 Nucleolar protein 14 0.1655500 0.1673781 0.1304902 0.3316845 0.2624486 0.2463796 0.1916546 0.0048095 -0.2138720 0.0602837 0.0116235 -0.0787983 0.2083589 0.2847564 0.1775857 0.0301476 -0.1771271 NA NA NA NA NA NA NA
20164_IPR020164 Cytochrome c oxidase assembly protein COX16 0.1726935 0.1743167 0.1401913 0.3366515 0.2664237 0.2465971 0.1886251 0.0107667 -0.2021732 0.0597697 0.0112517 -0.0772645 0.2005630 0.2728537 0.1671045 0.0205383 -0.1820106 NA NA NA NA NA NA NA
17423_IPR017423 tRNA (adenine(58)-N(1))-methyltransferase non-catalytic subunit TRM6 0.1888808 0.1904263 0.1566282 0.3606394 0.2854451 0.2612787 0.1971555 0.0164320 -0.2041769 0.0628495 0.0115992 -0.0805114 0.2057691 0.2788406 0.1680750 0.0136286 -0.1968364 NA NA NA NA NA NA NA
27108_IPR027108 Pre-mRNA-processing factor 6/Prp1/STA1 0.1869891 0.1885140 0.1551326 0.3568597 0.2824543 0.2584749 0.1949780 0.0163707 -0.2017563 0.0621641 0.0114673 -0.0796164 0.2034068 0.2756136 0.1660658 0.0132962 -0.1948155 NA NA NA NA NA NA NA
560_NA 0.1725739 0.1743615 0.1377201 0.3418688 0.2705257 0.2525000 0.1950847 0.0074058 -0.2142388 0.0615452 0.0117533 -0.0800887 0.2102147 0.2867733 0.1775711 0.0268874 -0.1834940 NA NA NA NA NA NA NA
2671_GO:0006412 translation 0.1685998 0.1704453 0.1331265 0.3372611 0.2668637 0.2503237 0.1945400 0.0052264 -0.2166168 0.0612163 0.0117878 -0.0799682 0.2112386 0.2886208 0.1798211 0.0300799 -0.1802326 NA NA NA NA NA NA NA
30661_IPR030661 SUMO-activating enzyme subunit Uba2 0.1680451 0.1699363 0.1319479 0.3378523 0.2673232 0.2513953 0.1959562 0.0041629 -0.2197125 0.0615818 0.0119079 -0.0806030 0.2135985 0.2920733 0.1825313 0.0319664 -0.1801419 NA NA NA NA NA NA NA
31120_IPR031120 WD repeat HIR1 0.1468733 0.1488188 0.1111284 0.3049193 0.2412190 0.2304562 0.1829147 -0.0022880 -0.2136018 0.0570342 0.0113078 -0.0755335 0.2039899 0.2802088 0.1782362 0.0391844 -0.1602905 NA NA NA NA NA NA NA
8576_GO:0006807 nitrogen compound metabolic process 0.1432243 0.1451537 0.1079036 0.2984084 0.2360634 0.2259171 0.1796577 -0.0028863 -0.2106794 0.0559721 0.0111264 -0.0742191 0.2008345 0.2760033 0.1758753 0.0394540 -0.1566226 NA NA NA NA NA NA NA
7720_GO:0009058 biosynthetic process 0.1575777 0.1593335 0.1239804 0.3162304 0.2502178 0.2350927 0.1830522 0.0042588 -0.2047339 0.0575535 0.0111122 -0.0752775 0.1992565 0.2723858 0.1700406 0.0293012 -0.1687505 NA NA NA NA NA NA NA
7704_GO:0009058 biosynthetic process 0.1591551 0.1607514 0.1277621 0.3135629 0.2481352 0.2309461 0.1778318 0.0078903 -0.1937211 0.0561852 0.0106783 -0.0729514 0.1907735 0.2600139 0.1604178 0.0227847 -0.1687172 NA NA NA NA NA NA NA
4299_NA 0.1070626 0.1096284 0.0645461 0.2600611 0.2055564 0.2101022 0.1790285 -0.0249190 -0.2403320 0.0541723 0.0117744 -0.0750123 0.2165804 0.3020784 0.2032972 0.0726825 -0.1280028 NA NA NA NA NA NA NA
2312_GO:0044281 small molecule metabolic process 0.1053821 0.1080028 0.0621680 0.2591129 0.2047945 0.2102966 0.1800076 -0.0264560 -0.2435776 0.0543667 0.0118825 -0.0754894 0.2188101 0.3054488 0.2061907 0.0752003 -0.1269191 NA NA NA NA NA NA NA
26872_IPR026872 Protein farnesyltransferase subunit beta 0.1135313 0.1160776 0.0709489 0.2700273 0.2134567 0.2163930 0.1828972 -0.0228891 -0.2419843 0.0555298 0.0119488 -0.0765105 0.2193439 0.3054552 0.2044233 0.0703652 -0.1340398 NA NA NA NA NA NA NA
8733_IPR008733 Peroxisomal biogenesis factor 11 0.1209279 0.1233341 0.0799601 0.2775433 0.2194390 0.2192627 0.1826297 -0.0181808 -0.2351854 0.0557887 0.0117856 -0.0761753 0.2155346 0.2992736 0.1981782 0.0631861 -0.1397969 NA NA NA NA NA NA NA
6084_NA 0.1173802 0.1199706 0.0739605 0.2777900 0.2195987 0.2221782 0.1874149 -0.0228087 -0.2470713 0.0569484 0.0122238 -0.0783696 0.2242801 0.3122081 0.2086515 0.0711259 -0.1381730 NA NA NA NA NA NA NA
22878_IPR022878 V-type ATP synthase catalytic alpha chain 0.1168367 0.1194122 0.0736599 0.2764076 0.2185063 0.2210423 0.1864309 -0.0226440 -0.2457120 0.0566527 0.0121582 -0.0779559 0.2230689 0.3105136 0.2074987 0.0706843 -0.1375048 NA NA NA NA NA NA NA
5946_GO:0009058 biosynthetic process 0.1229914 0.1254597 0.0810219 0.2829738 0.2237297 0.2237781 0.1865859 -0.0189184 -0.2407540 0.0569722 0.0120516 -0.0778419 0.2204601 0.3061783 0.2029085 0.0650752 -0.1423875 NA NA NA NA NA NA NA
1948_GO:0046872 metal ion binding 0.1411160 0.1432814 0.1025232 0.3027221 0.2394355 0.2322933 0.1875398 -0.0082003 -0.2270744 0.0580504 0.0117762 -0.0777232 0.2135168 0.2944762 0.1901897 0.0490372 -0.1568880 NA NA NA NA NA NA NA
3197_GO:0006091 generation of precursor metabolites and energy 0.1392345 0.1414313 0.1002926 0.3006687 0.2378024 0.2314056 0.1874367 -0.0093109 -0.2284870 0.0579373 0.0118045 -0.0777337 0.2142316 0.2956823 0.1915036 0.0506982 -0.1553817 NA NA NA NA NA NA NA
17132_IPR017132 Sm-like protein Lsm7 0.1440792 0.1462216 0.1056588 0.3068224 0.2426887 0.2346566 0.1887496 -0.0069843 -0.2267897 0.0585172 0.0118126 -0.0781639 0.2139472 0.2948191 0.1898025 0.0474327 -0.1595161 NA NA NA NA NA NA NA
27097_IPR027097 Mitotic spindle checkpoint protein Mad2 0.1481427 0.1501719 0.1111289 0.3097586 0.2450371 0.2349039 0.1871604 -0.0036637 -0.2203833 0.0582618 0.0116115 -0.0773499 0.2097118 0.2883354 0.1840556 0.0420960 -0.1623267 NA NA NA NA NA NA NA
8384_GO:0065003 protein-containing complex assembly 0.1199790 0.1221998 0.0817208 0.2698826 0.2134051 0.2114324 0.1745672 -0.0146648 -0.2210617 0.0535233 0.0111808 -0.0726828 0.2039947 0.2827329 0.1859779 0.0562893 -0.1370809 NA NA NA NA NA NA NA
4567_GO:0009058 biosynthetic process 0.0914410 0.0938162 0.0524922 0.2281671 0.1803229 0.1861898 0.1602235 -0.0250065 -0.2188171 0.0482853 0.0106219 -0.0672623 0.1958495 0.2736667 0.1853836 0.0691425 -0.1111129 NA NA NA NA NA NA NA
362_NA 0.2046618 0.2056808 0.1791181 0.3691809 0.2923147 0.2590503 0.1875235 0.0310881 -0.1728888 0.0609037 0.0105507 -0.0758395 0.1841803 0.2462534 0.1402005 -0.0104376 -0.2069066 NA NA NA NA NA NA NA
25655_IPR025655 Peroxisomal membrane protein 14 0.2002543 0.2013311 0.1741162 0.3638582 0.2880862 0.2563997 0.1866623 0.0288018 -0.1750501 0.0604681 0.0105691 -0.0755936 0.1849340 0.2477521 0.1422831 -0.0071451 -0.2032267 NA NA NA NA NA NA NA
7315_GO:0009058 biosynthetic process 0.2075643 0.2084592 0.1836455 0.3698545 0.2928725 0.2576404 0.1846669 0.0343358 -0.1651247 0.0602467 0.0102740 -0.0745064 0.1785977 0.2379370 0.1333315 -0.0159116 -0.2084938 NA NA NA NA NA NA NA
30468_IPR030468 NEDD8-activating enzyme E1 catalytic subunit 0.1904936 0.1917326 0.1625507 0.3531918 0.2796035 0.2517792 0.1861108 0.0230490 -0.1823469 0.0598771 0.0107145 -0.0756410 0.1886171 0.2539731 0.1490719 0.0014554 -0.1954085 NA NA NA NA NA NA NA
6846_GO:0006412 translation 0.1845944 0.1858245 0.1570943 0.3432243 0.2717077 0.2450631 0.1815203 0.0217384 -0.1788724 0.0583462 0.0104733 -0.0738107 0.1845185 0.2486191 0.1463389 0.0025429 -0.1896436 NA NA NA NA NA NA NA
16656_GO:0009058 biosynthetic process 0.2019918 0.2031821 0.1741336 0.3704437 0.2932821 0.2624448 0.1924273 0.0269424 -0.1842464 0.0621356 0.0109812 -0.0780596 0.1926969 0.2587761 0.1501740 -0.0032045 -0.2060025 NA NA NA NA NA NA NA
2738_IPR002738 RNase P subunit p30 0.1995368 0.2007823 0.1710191 0.3682330 0.2915198 0.2618088 0.1928594 0.0252049 -0.1871389 0.0621443 0.0110619 -0.0783208 0.1944714 0.2615629 0.1527982 -0.0004902 -0.2041755 NA NA NA NA NA NA NA
11603_IPR011603 2-oxoglutarate dehydrogenase E1 component 0.2081109 0.2092116 0.1812117 0.3775266 0.2989114 0.2657828 0.1932519 0.0303052 -0.1805588 0.0626381 0.0109270 -0.0782389 0.1910994 0.2559012 0.1466870 -0.0081339 -0.2110208 NA NA NA NA NA NA NA
6886_GO:0006351 transcription, DNA-templated 0.2451175 0.2454221 0.2276594 0.4120010 0.3263807 0.2766529 0.1881296 0.0557869 -0.1395339 0.0628903 0.0098182 -0.0749094 0.1664193 0.2168725 0.1093653 -0.0477064 -0.2389009 NA NA NA NA NA NA NA
12762_GO:0009058 biosynthetic process 0.2403300 0.2408130 0.2205694 0.4100234 0.3247792 0.2780128 0.1917936 0.0509631 -0.1503999 0.0636851 0.0101936 -0.0766507 0.1740709 0.2283469 0.1190130 -0.0396890 -0.2360271 NA NA NA NA NA NA NA
511_IPR000511 Cytochrome c/c1 haem-lyase 0.2293119 0.2300678 0.2062249 0.4009425 0.3175313 0.2760950 0.1947495 0.0426483 -0.1652643 0.0640045 0.0106343 -0.0782668 0.1835446 0.2430301 0.1324224 -0.0265255 -0.2280757 NA NA NA NA NA NA NA
2755_GO:0009058 biosynthetic process 0.2044722 0.2052060 0.1830277 0.3594818 0.2846851 0.2483834 0.1760362 0.0368162 -0.1517751 0.0577282 0.0096665 -0.0708290 0.1672021 0.2218050 0.1219038 -0.0213517 -0.2039518 NA NA NA NA NA NA NA
9244_IPR009244 Mediator complex, subunit Med7 0.2449597 0.2450257 0.2309328 0.4038840 0.3199956 0.2677254 0.1785149 0.0605818 -0.1218613 0.0602325 0.0090781 -0.0707158 0.1522085 0.1964091 0.0940507 -0.0568421 -0.2364285 NA NA NA NA NA NA NA
29751_GO:0006412 translation 0.2473117 0.2472180 0.2354499 0.4024816 0.3189154 0.2644120 0.1738446 0.0644121 -0.1112071 0.0590506 0.0086718 -0.0686070 0.1441853 0.1846272 0.0847020 -0.0635523 -0.2371394 NA NA NA NA NA NA NA
2994_IPR002994 Surfeit locus 1/Shy1 0.2594934 0.2594860 0.2457420 0.4253033 0.3369809 0.2807755 0.1860310 0.0657410 -0.1233957 0.0629582 0.0093790 -0.0735685 0.1566704 0.2014784 0.0946925 -0.0631840 -0.2497049 NA NA NA NA NA NA NA
10971_GO:0009058 biosynthetic process 0.1758571 0.1767942 0.1530261 0.3192473 0.2527668 0.2248484 0.1635805 0.0254661 -0.1530929 0.0530073 0.0092551 -0.0662352 0.1618974 0.2168392 0.1244018 -0.0066033 -0.1783842 NA NA NA NA NA NA NA
5279_IPR000109 Proton-dependent oligopeptide transporter family 0.1722869 0.1730662 0.1519089 0.3081983 0.2440430 0.2151937 0.1547368 0.0277594 -0.1397562 0.0504086 0.0086404 -0.0624790 0.1504059 0.2006137 0.1130080 -0.0118018 -0.1734139 NA NA NA NA NA NA NA
16461_GO:0016740 transferase activity 0.1412733 0.1423577 0.1181772 0.2673818 0.2116438 0.1927954 0.1446109 0.0137413 -0.1474329 0.0462221 0.0084552 -0.0589733 0.1496687 0.2024543 0.1211329 0.0074403 -0.1465273 NA NA NA NA NA NA NA
13025_GO:0006412 translation 0.3561798 0.3509502 0.4121626 0.4119027 0.3273750 0.1938749 0.0475415 0.1959769 0.2154956 0.0291256 -0.0031875 -0.0102409 -0.0936700 -0.1683203 -0.2037072 -0.2873700 -0.2919920 NA NA NA NA NA NA NA
2141_IPR002141 Influenza virus nucleoprotein (NP) 0.3365515 0.3315044 0.3909663 0.3857206 0.3065949 0.1793097 0.0407103 0.1873199 0.2114196 0.0263595 -0.0033373 -0.0078377 -0.0947644 -0.1680600 -0.1992436 -0.2755999 -0.2748725 NA NA NA NA NA NA NA
27185_IPR017241 Toll-like receptor 0.3583054 0.3528018 0.4181063 0.4063620 0.3230382 0.1861186 0.0381538 0.2020677 0.2346937 0.0266331 -0.0039540 -0.0060790 -0.1085969 -0.1900294 -0.2204528 -0.2984232 -0.2913726 NA NA NA NA NA NA NA
5133_IPR005133 Na+/H+ antiporter subunit G 0.3909856 0.3851747 0.4534489 0.4498353 0.3575428 0.2102368 0.0493840 0.2165541 0.2417455 0.0311989 -0.0037156 -0.0100176 -0.1069879 -0.1907697 -0.2281147 -0.3181584 -0.3198408 NA NA NA NA NA NA NA
2132_GO:0006412 translation 0.3825095 0.3768380 0.4434245 0.4405289 0.3501420 0.2061756 0.0488522 0.2115853 0.2355069 0.0306711 -0.0035934 -0.0100357 -0.1038681 -0.1854806 -0.2223043 -0.3107409 -0.3130386 NA NA NA NA NA NA NA
4769_IPR004769 Adenylosuccinate lyase 0.3378194 0.3321185 0.4015439 0.3662701 0.2913105 0.1566916 0.0155891 0.2008866 0.2590264 0.0194908 -0.0053033 0.0031691 -0.1326695 -0.2228012 -0.2405789 -0.3010419 -0.2697352 NA NA NA NA NA NA NA
22270_GO:0016491 oxidoreductase activity 0.3297492 0.3242848 0.3905109 0.3608274 0.2869522 0.1566337 0.0192154 0.1940528 0.2454326 0.0201276 -0.0048676 0.0013473 -0.1235596 -0.2089182 -0.2284107 -0.2899893 -0.2642680 NA NA NA NA NA NA NA
10226_IPR010226 NADH-quinone oxidoreductase, chain I 0.3577567 0.3518360 0.4235679 0.3917303 0.3115258 0.1702224 0.0211566 0.2103776 0.2657061 0.0219224 -0.0052571 0.0013267 -0.1335950 -0.2260010 -0.2473146 -0.3143213 -0.2867894 NA NA NA NA NA NA NA
5150_GO:0009058 biosynthetic process 0.4374276 0.4312770 0.5022819 0.5148125 0.4090918 0.2480740 0.0692089 0.2351738 0.2446074 0.0387532 -0.0030781 -0.0173028 -0.0989586 -0.1835461 -0.2327959 -0.3424718 -0.3612413 NA NA NA NA NA NA NA
1036_IPR001036 Acriflavin resistance protein 0.3982781 0.3914287 0.4752459 0.4275992 0.3401257 0.1800303 0.0132750 0.2394360 0.3148369 0.0215718 -0.0066469 0.0059650 -0.1639957 -0.2736023 -0.2918306 -0.3598468 -0.3167625 NA NA NA NA NA NA NA
14358_IPR014358 Enoyl-[acyl-carrier-protein] reductase (NADH) 0.3871416 0.3805612 0.4608452 0.4181960 0.3326232 0.1778415 0.0159908 0.2311701 0.3003161 0.0218197 -0.0062225 0.0044502 -0.1548239 -0.2593431 -0.2787135 -0.3468042 -0.3086592 NA NA NA NA NA NA NA
1591_IPR001591 Influenza RNA-dependent RNA polymerase subunit PB2 0.4072700 0.4008344 0.4778228 0.4559710 0.3625253 0.2049528 0.0362065 0.2333255 0.2800291 0.0282983 -0.0050478 -0.0037827 -0.1340762 -0.2313289 -0.2620781 -0.3461190 -0.3294414 NA NA NA NA NA NA NA
20948_NA 0.2950401 0.2887744 0.3691477 0.2775219 0.2211022 0.0896395 -0.0376096 0.2015132 0.3210966 0.0029002 -0.0085909 0.0251353 -0.1919687 -0.3042492 -0.2923675 -0.3123794 -0.2230670 NA NA NA NA NA NA NA
5670_IPR005670 Phosphate transport system permease protein 1 0.2865337 0.2803796 0.3594960 0.2672447 0.2129377 0.0845190 -0.0392771 0.1971035 0.3169354 0.0020579 -0.0085559 0.0256122 -0.1905221 -0.3013683 -0.2883569 -0.3060300 -0.2159636 NA NA NA NA NA NA NA
453_GO:0009058 biosynthetic process 0.3055940 0.2992792 0.3798436 0.2932097 0.2335416 0.0992650 -0.0319901 0.2051774 0.3196831 0.0049241 -0.0083599 0.0229932 -0.1884886 -0.3002220 -0.2916420 -0.3168287 -0.2327474 NA NA NA NA NA NA NA
2381_IPR002381 Ribonuclease PH, bacterial-type 0.2863774 0.2803739 0.3571878 0.2719482 0.2166352 0.0898765 -0.0333924 0.1940125 0.3059035 0.0036732 -0.0080980 0.0230380 -0.1817077 -0.2886520 -0.2787847 -0.3002019 -0.2172778 NA NA NA NA NA NA NA
11864_IPR011864 Phosphate ABC transporter, permease protein PstC 0.3299073 0.3232990 0.4070676 0.3234176 0.2575331 0.1148292 -0.0262166 0.2172686 0.3297108 0.0076093 -0.0083820 0.0211902 -0.1911267 -0.3062995 -0.3014877 -0.3340039 -0.2532966 NA NA NA NA NA NA NA
1062_IPR001062 Transcription antitermination protein, NusG 0.3125751 0.3060040 0.3901268 0.2962206 0.2359769 0.0974239 -0.0371791 0.2121331 0.3352430 0.0038075 -0.0088954 0.0254651 -0.1994176 -0.3166246 -0.3054631 -0.3283710 -0.2369755 NA NA NA NA NA NA NA
3764_IPR003764 N-acetylglucosamine-6-phosphate deacetylase 0.2661621 0.2602801 0.3363098 0.2427970 0.1935144 0.0724396 -0.0430713 0.1864418 0.3066015 0.0000957 -0.0084567 0.0266674 -0.1867618 -0.2940429 -0.2784323 -0.2906321 -0.1990007 NA NA NA NA NA NA NA
6130_GO:0006807 nitrogen compound metabolic process 0.2632171 0.2574362 0.3320720 0.2412965 0.1923059 0.0729596 -0.0411607 0.1836491 0.3005530 0.0004900 -0.0082523 0.0257461 -0.1825649 -0.2877194 -0.2730488 -0.2860317 -0.1971490 NA NA NA NA NA NA NA
5982_IPR005982 Thioredoxin reductase 0.2391295 0.2334546 0.3077498 0.2052868 0.1637539 0.0507625 -0.0542344 0.1754122 0.3042384 -0.0041977 -0.0087987 0.0307435 -0.1908763 -0.2974417 -0.2751030 -0.2761165 -0.1749945 NA NA NA NA NA NA NA
7016_NA 0.2539173 0.2477384 0.3289766 0.2129411 0.1699163 0.0482847 -0.0636829 0.1893609 0.3343402 -0.0061375 -0.0098138 0.0353060 -0.2117343 -0.3288829 -0.3019018 -0.2990761 -0.1843277 NA NA NA NA NA NA NA
5338_GO:0006807 nitrogen compound metabolic process 0.2474970 0.2415339 0.3198046 0.2095174 0.1671618 0.0492484 -0.0597023 0.1833668 0.3214963 -0.0053288 -0.0093825 0.0333782 -0.2028591 -0.3154926 -0.2904620 -0.2892253 -0.1802459 NA NA NA NA NA NA NA
114_GO:0006412 translation 0.3138185 0.3077303 0.3843772 0.3141625 0.2500992 0.1164916 -0.0170581 0.2026632 0.2990371 0.0094107 -0.0073642 0.0167159 -0.1700993 -0.2744923 -0.2741314 -0.3101067 -0.2428684 NA NA NA NA NA NA NA
394_IPR000394 RNA polymerase sigma factor 54 0.3027741 0.2968363 0.3717663 0.3010014 0.2396421 0.1100466 -0.0190025 0.1968256 0.2932259 0.0083779 -0.0073017 0.0172388 -0.1678933 -0.2702796 -0.2685699 -0.3016500 -0.2336995 NA NA NA NA NA NA NA
8141_GO:0006807 nitrogen compound metabolic process 0.3278833 0.3216942 0.3991402 0.3339003 0.2657576 0.1280169 -0.0109821 0.2082655 0.2997705 0.0117183 -0.0071656 0.0144781 -0.1675606 -0.2721505 -0.2754335 -0.3174005 -0.2554241 NA NA NA NA NA NA NA
4846_IPR004846 Type II/III secretion system 0.3075576 0.3018869 0.3724644 0.3176391 0.2527729 0.1250259 -0.0049360 0.1926248 0.2712504 0.0124711 -0.0063067 0.0112378 -0.1492026 -0.2437934 -0.2497437 -0.2925441 -0.2409006 NA NA NA NA NA NA NA
529_GO:0006412 translation 0.2817798 0.2761459 0.3475339 0.2765810 0.2202343 0.0984612 -0.0219760 0.1853613 0.2808412 0.0066140 -0.0071271 0.0179172 -0.1626265 -0.2607250 -0.2568378 -0.2848769 -0.2164468 NA NA NA NA NA NA NA
17714_GO:0009058 biosynthetic process 0.2943836 0.2894995 0.3487120 0.3219373 0.2560261 0.1396215 0.0169232 0.1733584 0.2195385 0.0179051 -0.0043634 0.0013038 -0.1106516 -0.1870071 -0.2042852 -0.2591113 -0.2358688 NA NA NA NA NA NA NA
4903_IPR004903 Lactobacillus surface layer protein 0.2586764 0.2539664 0.3124152 0.2691121 0.2141375 0.1073353 -0.0017859 0.1608065 0.2237584 0.0111412 -0.0051215 0.0084188 -0.1219749 -0.1999822 -0.2062525 -0.2437649 -0.2031912 NA NA NA NA NA NA NA
7812_IPR007812 Type II secretion system protein GspL 0.2060827 0.2023567 0.2485166 0.2152664 0.1712834 0.0864812 -0.0003712 0.1275765 0.1763166 0.0091659 -0.0039989 0.0062479 -0.0956130 -0.1570709 -0.1626290 -0.1931877 -0.1621355 NA NA NA NA NA NA NA
31723_GO:0016829 lyase activity 0.3181549 0.3133740 0.3697332 0.3643196 0.2895867 0.1691553 0.0381020 0.1772755 0.2005723 0.0248131 -0.0031845 -0.0072421 -0.0901533 -0.1596934 -0.1889675 -0.2609048 -0.2597538 NA NA NA NA NA NA NA
1971_GO:0006412 translation 0.3132595 0.3085376 0.3642541 0.3582318 0.2847518 0.1660154 0.0369329 0.1748444 0.1985655 0.0242706 -0.0031805 -0.0068761 -0.0896320 -0.1584839 -0.1869957 -0.2574530 -0.2556147 NA NA NA NA NA NA NA
456_GO:0006412 translation 0.3024839 0.2978715 0.3524818 0.3441702 0.2735892 0.1583669 0.0335599 0.1698999 0.1956294 0.0228561 -0.0032336 -0.0057215 -0.0896725 -0.1575337 -0.1839397 -0.2506272 -0.2463084 NA NA NA NA NA NA NA
4923_IPR004923 Iron permease FTR1/Fip1/EfeU 0.3065901 0.3021671 0.3536542 0.3571366 0.2838261 0.1697592 0.0440439 0.1671033 0.1797117 0.0259311 -0.0025024 -0.0101781 -0.0759914 -0.1382032 -0.1703335 -0.2443466 -0.2521014 NA NA NA NA NA NA NA
4792_NA 0.2858812 0.2818996 0.3277208 0.3377098 0.2683482 0.1635259 0.0467473 0.1529268 0.1570562 0.0257451 -0.0018945 -0.0119701 -0.0624227 -0.1167119 -0.1497101 -0.2223593 -0.2364598 NA NA NA NA NA NA NA
17665_GO:0006807 nitrogen compound metabolic process 0.3452918 0.3403813 0.3972790 0.4045557 0.3214920 0.1937924 0.0524292 0.1867594 0.1971640 0.0299835 -0.0025999 -0.0126967 -0.0813864 -0.1496000 -0.1872980 -0.2724629 -0.2846148 NA NA NA NA NA NA NA
4607_GO:0009058 biosynthetic process 0.3321917 0.3273820 0.3834353 0.3863864 0.3070765 0.1832967 0.0470294 0.1814095 0.1960007 0.0279056 -0.0027649 -0.0107257 -0.0833656 -0.1512259 -0.1856688 -0.2654191 -0.2729838 NA NA NA NA NA NA NA
218_GO:0006412 translation 0.3496593 0.3448321 0.4002195 0.4144588 0.3293224 0.2015767 0.0588791 0.1861772 0.1889406 0.0319581 -0.0021856 -0.0153841 -0.0738191 -0.1391041 -0.1803751 -0.2703219 -0.2896281 NA NA NA NA NA NA NA
16932_IPR016932 Uncharacterised conserved protein UCP029669 0.3532248 0.3487606 0.3983893 0.4322571 0.3433541 0.2187559 0.0758893 0.1797255 0.1604750 0.0368081 -0.0009396 -0.0227065 -0.0501932 -0.1053921 -0.1558642 -0.2572340 -0.2965893 NA NA NA NA NA NA NA
2975_IPR001019 Guanine nucleotide binding protein (G-protein), alpha subunit 0.3453459 0.3408109 0.3919455 0.4170072 0.3312850 0.2076272 0.0674159 0.1791669 0.1694537 0.0341177 -0.0014427 -0.0192392 -0.0591470 -0.1175573 -0.1632756 -0.2580432 -0.2883176 NA NA NA NA NA NA NA
4625_GO:0009058 biosynthetic process 0.3301929 0.3260877 0.3714390 0.4063069 0.3227236 0.2069825 0.0736433 0.1666315 0.1450064 0.0351531 -0.0006694 -0.0224059 -0.0429058 -0.0927350 -0.1413619 -0.2378520 -0.2779103 NA NA NA NA NA NA NA
15815_GO:0016491 oxidoreductase activity 0.4051042 0.3999372 0.4575764 0.4941961 0.3925663 0.2491603 0.0851637 0.2070749 0.1875111 0.0416982 -0.0012222 -0.0252241 -0.0603459 -0.1248785 -0.1817621 -0.2968221 -0.3396933 NA NA NA NA NA NA NA
92_GO:0009058 biosynthetic process 0.3799115 0.3750238 0.4297255 0.4620741 0.3670611 0.2321179 0.0781882 0.1950518 0.1789604 0.0386421 -0.0012760 -0.0229222 -0.0590879 -0.1207076 -0.1731552 -0.2799848 -0.3181583 NA NA NA NA NA NA NA
577_GO:0005975 carbohydrate metabolic process 0.3668820 0.3630482 0.4022764 0.4754107 0.3774238 0.2566775 0.1107928 0.1704070 0.1074703 0.0470450 0.0014950 -0.0375440 -0.0046401 -0.0410270 -0.1105559 -0.2363121 -0.3158646 NA NA NA NA NA NA NA
14434_IPR014434 Monothiol glutaredoxin 0.3623264 0.3581425 0.4029847 0.4564130 0.3624392 0.2388988 0.0935846 0.1763473 0.1354587 0.0420959 0.0002528 -0.0301644 -0.0281034 -0.0744119 -0.1346062 -0.2486648 -0.3080757 NA NA NA NA NA NA NA
7325_GO:0006807 nitrogen compound metabolic process 0.3207744 0.3173008 0.3534642 0.4116606 0.3268430 0.2199589 0.0920288 0.1514541 0.1029285 0.0397983 0.0009330 -0.0307122 -0.0112476 -0.0462329 -0.1044341 -0.2112872 -0.2749864 NA NA NA NA NA NA NA
11537_IPR011537 NADH ubiquinone oxidoreductase, F subunit 0.3041458 0.3010939 0.3316753 0.3982774 0.3161575 0.2174231 0.0968787 0.1387074 0.0797747 0.0403875 0.0016282 -0.0333210 0.0036279 -0.0232403 -0.0835720 -0.1910451 -0.2630811 NA NA NA NA NA NA NA
28268_IPR028268 Pianissimo family -0.2486329 -0.2460016 -0.2730953 -0.3210893 -0.2549177 -0.1727308 -0.0737624 -0.1161556 -0.0752783 -0.0315178 -0.0009111 0.0248664 0.0051071 0.0306316 0.0770440 0.1614223 0.2137362 NA NA NA NA NA NA NA
16484_IPR016484 GTP-binding protein EngA -0.2516543 -0.2499826 -0.2621908 -0.3576461 -0.2836967 -0.2112181 -0.1141417 -0.0974762 -0.0030678 -0.0427861 -0.0039737 0.0424091 -0.0534871 -0.0535217 0.0145804 0.1252613 0.2259765 NA NA NA NA NA NA NA
5815_GO:0016740 transferase activity 0.0052099 0.0015722 0.0571048 -0.1112409 -0.0874344 -0.1278369 -0.1410904 0.0750142 0.2657496 -0.0386637 -0.0110052 0.0617619 -0.2120082 -0.3059985 -0.2306526 -0.1411039 0.0303574 NA NA NA NA NA NA NA
2176_IPR002176 Crossover junction endodeoxyribonuclease RuvC -0.0230526 -0.0261403 0.0224636 -0.1394791 -0.1099149 -0.1382666 -0.1394873 0.0567283 0.2386945 -0.0394928 -0.0103368 0.0602273 -0.1965898 -0.2811346 -0.2058569 -0.1131110 0.0522139 NA NA NA NA NA NA NA
5990_GO:0009058 biosynthetic process -0.0351494 -0.0381614 0.0099265 -0.1568227 -0.1236714 -0.1485889 -0.1451577 0.0521362 0.2388872 -0.0416001 -0.0105420 0.0623460 -0.1994337 -0.2841005 -0.2054509 -0.1072672 0.0631212 NA NA NA NA NA NA NA
26856_GO:0016787 hydrolase activity -0.1227731 -0.1234412 -0.1066339 -0.2233385 -0.1768278 -0.1574872 -0.1147571 -0.0174966 0.1079079 -0.0371595 -0.0065042 0.0464838 -0.1138515 -0.1525721 -0.0877409 0.0040744 0.1246729 NA NA NA NA NA NA NA
7318_NA -0.0845598 -0.0864456 -0.0529983 -0.2007658 -0.1587069 -0.1607776 -0.1357958 0.0168300 0.1794395 -0.0412412 -0.0088665 0.0567990 -0.1627337 -0.2265898 -0.1515691 -0.0519951 0.0997301 NA NA NA NA NA NA NA
30970_IPR030970 Probable phospholipid ABC transporter-binding protein MlaD -0.1622414 -0.1631436 -0.1406379 -0.2957697 -0.2341717 -0.2088215 -0.1524148 -0.0227314 0.1440168 -0.0493166 -0.0086544 0.0617617 -0.1515901 -0.2032605 -0.1171778 0.0046443 0.1649391 NA NA NA NA NA NA NA
# I want to save this as a csv.
# Character fields are quoted: feature labels in this analysis can contain
# commas (e.g. "transcription, DNA-templated"), and an unquoted comma inside a
# label would split that row into extra columns when the file is read back.
# Numeric columns are never quoted, so they are written exactly as before.
# Row names are not written.
write.csv(
  MINT_sPLS_mat.corr_and_LOGOCV_GOs,
  file = "/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Code/output_tables/MINT_sPLS_sim_and LOGOCV_stab_scores_GOs.csv",
  quote = TRUE,
  row.names = FALSE # spelled out: `F` is an ordinary variable that can be reassigned
)

Do microbial taxa or genes/functions have a higher utility as indicators?

Bray-Curtis similarity

Bray-Curtis similarity within replicates - is Function always more stable regardless of the hierarchical level?

# Taxa - Genus level
# One table holding, for every OTU, the columns of the OTU table followed by
# the columns of the taxonomy table. The row names of both tables are moved
# into an "OTU" column so the two can be joined on it; after the join the OTU
# ids are turned back into row names.
GENUS <- otu_table(megan_genus_abundant_known_phyla_only) %>%
  as.data.frame() %>%
  rownames_to_column("OTU") %>%
  left_join(
    tax_table(megan_genus_abundant_known_phyla_only) %>%
      as.data.frame() %>%
      rownames_to_column("OTU")
  ) %>%
  column_to_rownames("OTU")
## Joining with `by = join_by(OTU)`
# Now summarising raw counts at genus level: every numeric column is summed
# within each value of "Genus", and the genus names then become the row names.
GENUS <- GENUS %>%
  ddply("Genus", numcolwise(sum)) %>%
  column_to_rownames("Genus")

# Ready to compute relative abundances per sample - every cell is divided by
# the sum of its column, so that each column adds up to 1.
# All columns of GENUS are numeric at this point (the genus names were moved to
# the row names above), so every column is processed. Going column-wise with
# lapply() instead of a `1:ncol()` index loop also stays correct for a table
# with zero columns, where `1:0` would iterate over c(1, 0).
GENUS_RA <- GENUS
GENUS_RA[] <- lapply(GENUS, function(counts) counts / sum(counts))
# Checking that rel abunds sum up to 1:
colSums(GENUS_RA)
##          11-049-1_S89_R1          11-049-2_S90_R1          11-049-3_S91_R1 
##                        1                        1                        1 
##          11-049-4_S92_R1          11-162-1_S81_R1          11-162-2_S82_R1 
##                        1                        1                        1 
##          11-162-3_S83_R1          11-162-4_S84_R1           13-124-1_S9_R1 
##                        1                        1                        1 
##          13-124-2_S10_R1          13-124-3_S11_R1          13-124-4_S12_R1 
##                        1                        1                        1 
##          21-550-1_S69_R1          21-550-2_S70_R1          21-550-3_S71_R1 
##                        1                        1                        1 
##          21-550-4_S72_R1          21-580-1_S57_R1          21-580-2_S58_R1 
##                        1                        1                        1 
##          21-580-3_S59_R1          21-580-4_S60_R1          22-084-1_S41_R1 
##                        1                        1                        1 
##          22-084-2_S42_R1          22-084-3_S43_R1          22-084-4_S44_R1 
##                        1                        1                        1 
##      Agincourt1-1_S33_R1      Agincourt1-2_S34_R1      Agincourt1-3_S35_R1 
##                        1                        1                        1 
##      Agincourt1-4_S36_R1       Arlington-1_S37_R1       Arlington-2_S38_R1 
##                        1                        1                        1 
##       Arlington-3_S39_R1       Arlington-4_S40_R1           Boult-1_S25_R1 
##                        1                        1                        1 
##           Boult-2_S26_R1           Boult-3_S27_R1           Boult-4_S28_R1 
##                        1                        1                        1 
##      Broomfield-1_S49_R1      Broomfield-3_S51_R1      Broomfield-4_S52_R1 
##                        1                        1                        1 
## Broomfield-rpt-2_S115_R1       Centipede-1_S57_R1       Centipede-2_S58_R1 
##                        1                        1                        1 
##       Centipede-3_S59_R1       Centipede-4_S60_R1         Chicken-1_S69_R1 
##                        1                        1                        1 
##         Chicken-2_S70_R1         Chicken-3_S71_R1         Chicken-4_S72_R1 
##                        1                        1                        1 
##        Chinaman-1_S65_R1        Chinaman-2_S66_R1        Chinaman-3_S67_R1 
##                        1                        1                        1 
##        Chinaman-4_S68_R1         Corbett-1_S17_R1         Corbett-2_S18_R1 
##                        1                        1                        1 
##         Corbett-3_S19_R1         Corbett-4_S20_R1            Davie-1_S1_R1 
##                        1                        1                        1 
##            Davie-2_S2_R1            Davie-3_S3_R1            Davie-4_S4_R1 
##                        1                        1                        1 
##         Erskine-1_S61_R1         Erskine-2_S62_R1         Erskine-3_S63_R1 
##                        1                        1                        1 
##         Erskine-4_S64_R1         Fairfax-1_S33_R1         Fairfax-2_S34_R1 
##                        1                        1                        1 
##         Fairfax-3_S35_R1         Fairfax-4_S36_R1     Farquaharson-1_S1_R1 
##                        1                        1                        1 
##     Farquaharson-2_S2_R1     Farquaharson-3_S3_R1     Farquaharson-4_S4_R1 
##                        1                        1                        1 
##          Feather-1_S5_R1          Feather-2_S6_R1          Feather-3_S7_R1 
##                        1                        1                        1 
##          Feather-4_S8_R1    Fore-and-Aft-1_S77_R1    Fore-and-Aft-2_S78_R1 
##                        1                        1                        1 
##    Fore-and-Aft-3_S79_R1    Fore-and-Aft-4_S80_R1            Fork-1_S49_R1 
##                        1                        1                        1 
##            Fork-2_S50_R1            Fork-3_S51_R1            Fork-4_S52_R1 
##                        1                        1                        1 
##            Grub-1_S65_R1            Grub-2_S66_R1            Grub-3_S67_R1 
##                        1                        1                        1 
##            Grub-4_S68_R1        Hastings-1_S41_R1        Hastings-2_S42_R1 
##                        1                        1                        1 
##        Hastings-3_S43_R1        Hastings-4_S44_R1          Hedley-1_S21_R1 
##                        1                        1                        1 
##          Hedley-2_S22_R1          Hedley-3_S23_R1           Helix-1_S61_R1 
##                        1                        1                        1 
##           Helix-2_S62_R1           Helix-3_S63_R1           Helix-4_S64_R1 
##                        1                        1                        1 
##          Hoskyn-1_S29_R1          Hoskyn-2_S30_R1          Hoskyn-3_S31_R1 
##                        1                        1                        1 
##          Hoskyn-4_S32_R1      JohnBrewer-1_S93_R1      JohnBrewer-2_S94_R1 
##                        1                        1                        1 
##      JohnBrewer-3_S97_R1      JohnBrewer-4_S98_R1           Kelso-1_S85_R1 
##                        1                        1                        1 
##           Kelso-2_S86_R1           Kelso-3_S87_R1           Kelso-4_S88_R1 
##                        1                        1                        1 
##           Knife-1_S45_R1           Knife-2_S46_R1           Knife-3_S47_R1 
##                        1                        1                        1 
##           Knife-4_S48_R1          Lagoon-1_S13_R1          Lagoon-2_S14_R1 
##                        1                        1                        1 
##          Lagoon-3_S15_R1          Lagoon-4_S16_R1     LittleKelso-1_S81_R1 
##                        1                        1                        1 
##     LittleKelso-2_S82_R1     LittleKelso-3_S83_R1     LittleKelso-4_S84_R1 
##                        1                        1                        1 
##          Lynchs-1_S99_R1         Lynchs-2_S100_R1         Lynchs-3_S101_R1 
##                        1                        1                        1 
##         Lynchs-4_S102_R1          Mantis-1_S85_R1          Mantis-2_S86_R1 
##                        1                        1                        1 
##          Mantis-3_S87_R1          Mantis-4_S88_R1        Masthead-1_S53_R1 
##                        1                        1                        1 
##        Masthead-2_S54_R1        Masthead-3_S55_R1        Masthead-4_S56_R1 
##                        1                        1                        1 
##       McCulloch-1_S17_R1       McCulloch-2_S18_R1       McCulloch-3_S19_R1 
##                        1                        1                        1 
##       McCulloch-4_S20_R1        McSweeney-1_S5_R1        McSweeney-2_S6_R1 
##                        1                        1                        1 
##        McSweeney-3_S7_R1        McSweeney-4_S8_R1         Monsoon-1_S21_R1 
##                        1                        1                        1 
##         Monsoon-2_S22_R1         Monsoon-3_S23_R1         Monsoon-4_S24_R1 
##                        1                        1                        1 
##           Moore-1_S25_R1           Moore-2_S26_R1           Moore-3_S27_R1 
##                        1                        1                        1 
##           Moore-4_S28_R1        Myrmidon-1_S53_R1        Myrmidon-2_S54_R1 
##                        1                        1                        1 
##        Myrmidon-3_S55_R1        Myrmidon-4_S56_R1           North-1_S37_R1 
##                        1                        1                        1 
##           North-2_S38_R1           North-3_S39_R1           North-4_S40_R1 
##                        1                        1                        1 
##           Peart-1_S13_R1           Peart-2_S14_R1           Peart-3_S15_R1 
##                        1                        1                        1 
##           Peart-4_S16_R1             Rib-1_S73_R1             Rib-2_S74_R1 
##                        1                        1                        1 
##             Rib-3_S75_R1             Rib-4_S76_R1        Roxburgh-1_S89_R1 
##                        1                        1                        1 
##        Roxburgh-2_S90_R1        Roxburgh-3_S91_R1        Roxburgh-4_S92_R1 
##                        1                        1                        1 
##        Sanbank1-1_S77_R1        Sanbank1-2_S78_R1        Sanbank1-3_S79_R1 
##                        1                        1                        1 
##        Sanbank1-4_S80_R1     SmallLagoon-1_S45_R1     SmallLagoon-2_S46_R1 
##                        1                        1                        1 
##     SmallLagoon-3_S47_R1     SmallLagoon-4_S48_R1      St-Crispin-1_S73_R1 
##                        1                        1                        1 
##      St-Crispin-2_S74_R1      St-Crispin-3_S75_R1      St-Crispin-4_S76_R1 
##                        1                        1                        1 
##           Taylor-1_S9_R1          Taylor-2_S10_R1          Taylor-3_S11_R1 
##                        1                        1                        1 
##          Taylor-4_S12_R1        Thetford-1_S29_R1        Thetford-2_S30_R1 
##                        1                        1                        1 
##        Thetford-3_S31_R1        Thetford-4_S32_R1 
##                        1                        1
# Pairwise Bray-Curtis dissimilarities between all samples, reshaped to long
# format (columns Var1, Var2, value) and annotated with the reef name of the
# first and of the second sample of each pair.
#
# `diag` and `upper` only control how the distance object is printed.
# as.matrix() always returns the full square, symmetric matrix with a zero
# diagonal, so the long table holds every pair in both orders (A-B and B-A)
# as well as each sample compared with itself.
# NOTE(review): because both orders are kept, every within-reef comparison is
# present twice in the filtered table built from this one. Medians and means
# are unaffected, but the number of observations is doubled, which affects the
# SD and any test run on these values - confirm this is intended.
bray_curtis_genus <- vegdist(t(GENUS_RA), # needs transposing
                             method = "bray", # I am computing Bray Curtis dissimilarity
                             diag = FALSE, # print option only (see above)
                             upper = TRUE) %>% # print option only (see above)
  as.matrix() %>% # Output as full square matrix
  reshape2::melt() %>% # Getting it in long format to have pairwise comparisons - this is needed for visualisation
  left_join(., data.frame(sample_data(megan_genus_clr)) %>%
              rownames_to_column("Var1")) %>% # adding reef names for the first sample of each pair
  dplyr::select(c("Var1", "Var2", "value", "REEF_NAME")) %>% # Selecting only columns of interest
  dplyr::rename(., REEF_NAME_for_Var1 = REEF_NAME) %>% # Rename to know that reef names correspond to the first sample of each pair
  left_join(., data.frame(sample_data(megan_genus_clr)) %>%
              rownames_to_column("Var2")) %>% # Now merging based on Var2
  dplyr::select(c("Var1", "Var2", "value", "REEF_NAME_for_Var1", "REEF_NAME")) %>% # Selecting only columns of interest
  dplyr::rename(., REEF_NAME_for_Var2 = REEF_NAME) # Rename to know that reef names correspond to the second sample of each pair
## Joining with `by = join_by(Var1)`
## Joining with `by = join_by(Var2)`
# Keeping only within-reef comparisons between different replicates, and
# converting dissimilarity to similarity.
bray_curtis_genus <- bray_curtis_genus %>%
  # Only selecting pairs where the reef names are the same, that way I only
  # pull out dissimilarity values from the same site - Within replicate level!
  dplyr::filter(REEF_NAME_for_Var1 == REEF_NAME_for_Var2) %>%
  # Removing comparisons of a replicate with itself. These are identified by
  # sample id rather than by `value != 0`: testing the value relies on an exact
  # floating-point zero and would also discard two different replicates that
  # happen to have identical profiles.
  dplyr::filter(as.character(Var1) != as.character(Var2)) %>%
  mutate(Bray_Curtis_similarity = 1 - value) %>% # computing Bray Curtis similarity as 1 - BC dissimilarity
  # Columns 1 and 2 of the sample data are taken by position; presumably these
  # are Reef name and Sampling trip - TODO confirm the column order.
  left_join(., data.frame(sample_data(megan_genus_clr))[, c(1, 2)] %>%
              rownames_to_column("Var1"))
## Joining with `by = join_by(Var1)`
# Summary statistics of the within-site Bray-Curtis similarities (genus level),
# each rounded to two decimals
bray_curtis_genus_median <- bray_curtis_genus$Bray_Curtis_similarity %>%
  median() %>%
  round(digits = 2)
bray_curtis_genus_mean <- bray_curtis_genus$Bray_Curtis_similarity %>%
  mean() %>%
  round(digits = 2)
bray_curtis_genus_SD <- bray_curtis_genus$Bray_Curtis_similarity %>%
  sd() %>%
  round(digits = 2)
bray_curtis_genus_minimum <- bray_curtis_genus$Bray_Curtis_similarity %>%
  min() %>%
  round(digits = 2)

# Plotting as boxplots:
# One box of within-site Bray-Curtis similarity (genus level). Replicate pairs are
# overlaid as jittered points coloured by sampling trip, the mean is drawn as a
# green dot and the summary statistics computed above are printed as text.
bray_curtis_genus_boxplots <- bray_curtis_genus %>%
  ggplot(aes(x = "Within replicate for taxa (Genus level)",
             y = Bray_Curtis_similarity)) +
  # Outliers are hidden in the box layer because the jitter layer draws every point
  geom_boxplot(show.legend = FALSE, outlier.shape = NA) +
  geom_jitter(aes(color = Sampling_trip), size = 0.4, alpha = 0.2) +
  stat_summary(aes(label = paste("Median:", bray_curtis_genus_median,
                                 "\nMean:", bray_curtis_genus_mean,
                                 "\nSD:", bray_curtis_genus_SD,
                                 "\nMin:", bray_curtis_genus_minimum)),
               fun = median, # `fun.y` is deprecated since ggplot2 3.3.0; `fun` replaces it
               geom = "text",
               color = "black") + # median, mean, SD and minimum as text, placed at the median
  stat_summary(fun = mean,
               geom = "point",
               shape = 20,
               size = 1.5,
               color = "seagreen1",
               fill = "seagreen1") + # Plotting the mean as a green dot!
  scale_color_manual(values = c("indianred", # Sampling trip 1
                                "indianred4", # Sampling trip 2
                                "red3", # Sampling trip 3
                                "slateblue") # Sampling trip 4
  ) +
  ylim(0, 1) + # Fixing the y axis to the full 0-1 range of Bray-Curtis similarity
  ylab("Bray-Curtis similarity") +
  theme(legend.position = "none")
## Warning: The `fun.y` argument of `stat_summary()` is deprecated as of ggplot2 3.3.0.
## ℹ Please use the `fun` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Taxa - Family level
# Attach the taxonomy table to the OTU table (matched on the OTU identifier taken
# from the row names), then move the identifier back into the row names
FAMILY <- otu_table(megan_genus_abundant_known_phyla_only) %>%
  as.data.frame() %>%
  rownames_to_column("OTU") %>%
  left_join(
    tax_table(megan_genus_abundant_known_phyla_only) %>%
      as.data.frame() %>%
      rownames_to_column("OTU")
  ) %>%
  column_to_rownames("OTU")
## Joining with `by = join_by(OTU)`
# Now summarising raw counts at family level: every numeric column is summed
# within each Family, and the Family names become the row names
FAMILY <- ddply(FAMILY, "Family", numcolwise(sum)) %>%
  column_to_rownames("Family")

# Ready to compute relative abundances per sample - by dividing cell value with column sum
FAMILY_RA <- FAMILY
# seq_len() instead of `1:ncol()`, which would iterate over c(1, 0) for a table
# without columns
for (i in seq_len(ncol(FAMILY))) {
  FAMILY_RA[i] <- FAMILY_RA[i] / sum(FAMILY_RA[i])
}
# Checking that rel abunds sum up to 1:
colSums(FAMILY_RA)
##          11-049-1_S89_R1          11-049-2_S90_R1          11-049-3_S91_R1 
##                        1                        1                        1 
##          11-049-4_S92_R1          11-162-1_S81_R1          11-162-2_S82_R1 
##                        1                        1                        1 
##          11-162-3_S83_R1          11-162-4_S84_R1           13-124-1_S9_R1 
##                        1                        1                        1 
##          13-124-2_S10_R1          13-124-3_S11_R1          13-124-4_S12_R1 
##                        1                        1                        1 
##          21-550-1_S69_R1          21-550-2_S70_R1          21-550-3_S71_R1 
##                        1                        1                        1 
##          21-550-4_S72_R1          21-580-1_S57_R1          21-580-2_S58_R1 
##                        1                        1                        1 
##          21-580-3_S59_R1          21-580-4_S60_R1          22-084-1_S41_R1 
##                        1                        1                        1 
##          22-084-2_S42_R1          22-084-3_S43_R1          22-084-4_S44_R1 
##                        1                        1                        1 
##      Agincourt1-1_S33_R1      Agincourt1-2_S34_R1      Agincourt1-3_S35_R1 
##                        1                        1                        1 
##      Agincourt1-4_S36_R1       Arlington-1_S37_R1       Arlington-2_S38_R1 
##                        1                        1                        1 
##       Arlington-3_S39_R1       Arlington-4_S40_R1           Boult-1_S25_R1 
##                        1                        1                        1 
##           Boult-2_S26_R1           Boult-3_S27_R1           Boult-4_S28_R1 
##                        1                        1                        1 
##      Broomfield-1_S49_R1      Broomfield-3_S51_R1      Broomfield-4_S52_R1 
##                        1                        1                        1 
## Broomfield-rpt-2_S115_R1       Centipede-1_S57_R1       Centipede-2_S58_R1 
##                        1                        1                        1 
##       Centipede-3_S59_R1       Centipede-4_S60_R1         Chicken-1_S69_R1 
##                        1                        1                        1 
##         Chicken-2_S70_R1         Chicken-3_S71_R1         Chicken-4_S72_R1 
##                        1                        1                        1 
##        Chinaman-1_S65_R1        Chinaman-2_S66_R1        Chinaman-3_S67_R1 
##                        1                        1                        1 
##        Chinaman-4_S68_R1         Corbett-1_S17_R1         Corbett-2_S18_R1 
##                        1                        1                        1 
##         Corbett-3_S19_R1         Corbett-4_S20_R1            Davie-1_S1_R1 
##                        1                        1                        1 
##            Davie-2_S2_R1            Davie-3_S3_R1            Davie-4_S4_R1 
##                        1                        1                        1 
##         Erskine-1_S61_R1         Erskine-2_S62_R1         Erskine-3_S63_R1 
##                        1                        1                        1 
##         Erskine-4_S64_R1         Fairfax-1_S33_R1         Fairfax-2_S34_R1 
##                        1                        1                        1 
##         Fairfax-3_S35_R1         Fairfax-4_S36_R1     Farquaharson-1_S1_R1 
##                        1                        1                        1 
##     Farquaharson-2_S2_R1     Farquaharson-3_S3_R1     Farquaharson-4_S4_R1 
##                        1                        1                        1 
##          Feather-1_S5_R1          Feather-2_S6_R1          Feather-3_S7_R1 
##                        1                        1                        1 
##          Feather-4_S8_R1    Fore-and-Aft-1_S77_R1    Fore-and-Aft-2_S78_R1 
##                        1                        1                        1 
##    Fore-and-Aft-3_S79_R1    Fore-and-Aft-4_S80_R1            Fork-1_S49_R1 
##                        1                        1                        1 
##            Fork-2_S50_R1            Fork-3_S51_R1            Fork-4_S52_R1 
##                        1                        1                        1 
##            Grub-1_S65_R1            Grub-2_S66_R1            Grub-3_S67_R1 
##                        1                        1                        1 
##            Grub-4_S68_R1        Hastings-1_S41_R1        Hastings-2_S42_R1 
##                        1                        1                        1 
##        Hastings-3_S43_R1        Hastings-4_S44_R1          Hedley-1_S21_R1 
##                        1                        1                        1 
##          Hedley-2_S22_R1          Hedley-3_S23_R1           Helix-1_S61_R1 
##                        1                        1                        1 
##           Helix-2_S62_R1           Helix-3_S63_R1           Helix-4_S64_R1 
##                        1                        1                        1 
##          Hoskyn-1_S29_R1          Hoskyn-2_S30_R1          Hoskyn-3_S31_R1 
##                        1                        1                        1 
##          Hoskyn-4_S32_R1      JohnBrewer-1_S93_R1      JohnBrewer-2_S94_R1 
##                        1                        1                        1 
##      JohnBrewer-3_S97_R1      JohnBrewer-4_S98_R1           Kelso-1_S85_R1 
##                        1                        1                        1 
##           Kelso-2_S86_R1           Kelso-3_S87_R1           Kelso-4_S88_R1 
##                        1                        1                        1 
##           Knife-1_S45_R1           Knife-2_S46_R1           Knife-3_S47_R1 
##                        1                        1                        1 
##           Knife-4_S48_R1          Lagoon-1_S13_R1          Lagoon-2_S14_R1 
##                        1                        1                        1 
##          Lagoon-3_S15_R1          Lagoon-4_S16_R1     LittleKelso-1_S81_R1 
##                        1                        1                        1 
##     LittleKelso-2_S82_R1     LittleKelso-3_S83_R1     LittleKelso-4_S84_R1 
##                        1                        1                        1 
##          Lynchs-1_S99_R1         Lynchs-2_S100_R1         Lynchs-3_S101_R1 
##                        1                        1                        1 
##         Lynchs-4_S102_R1          Mantis-1_S85_R1          Mantis-2_S86_R1 
##                        1                        1                        1 
##          Mantis-3_S87_R1          Mantis-4_S88_R1        Masthead-1_S53_R1 
##                        1                        1                        1 
##        Masthead-2_S54_R1        Masthead-3_S55_R1        Masthead-4_S56_R1 
##                        1                        1                        1 
##       McCulloch-1_S17_R1       McCulloch-2_S18_R1       McCulloch-3_S19_R1 
##                        1                        1                        1 
##       McCulloch-4_S20_R1        McSweeney-1_S5_R1        McSweeney-2_S6_R1 
##                        1                        1                        1 
##        McSweeney-3_S7_R1        McSweeney-4_S8_R1         Monsoon-1_S21_R1 
##                        1                        1                        1 
##         Monsoon-2_S22_R1         Monsoon-3_S23_R1         Monsoon-4_S24_R1 
##                        1                        1                        1 
##           Moore-1_S25_R1           Moore-2_S26_R1           Moore-3_S27_R1 
##                        1                        1                        1 
##           Moore-4_S28_R1        Myrmidon-1_S53_R1        Myrmidon-2_S54_R1 
##                        1                        1                        1 
##        Myrmidon-3_S55_R1        Myrmidon-4_S56_R1           North-1_S37_R1 
##                        1                        1                        1 
##           North-2_S38_R1           North-3_S39_R1           North-4_S40_R1 
##                        1                        1                        1 
##           Peart-1_S13_R1           Peart-2_S14_R1           Peart-3_S15_R1 
##                        1                        1                        1 
##           Peart-4_S16_R1             Rib-1_S73_R1             Rib-2_S74_R1 
##                        1                        1                        1 
##             Rib-3_S75_R1             Rib-4_S76_R1        Roxburgh-1_S89_R1 
##                        1                        1                        1 
##        Roxburgh-2_S90_R1        Roxburgh-3_S91_R1        Roxburgh-4_S92_R1 
##                        1                        1                        1 
##        Sanbank1-1_S77_R1        Sanbank1-2_S78_R1        Sanbank1-3_S79_R1 
##                        1                        1                        1 
##        Sanbank1-4_S80_R1     SmallLagoon-1_S45_R1     SmallLagoon-2_S46_R1 
##                        1                        1                        1 
##     SmallLagoon-3_S47_R1     SmallLagoon-4_S48_R1      St-Crispin-1_S73_R1 
##                        1                        1                        1 
##      St-Crispin-2_S74_R1      St-Crispin-3_S75_R1      St-Crispin-4_S76_R1 
##                        1                        1                        1 
##           Taylor-1_S9_R1          Taylor-2_S10_R1          Taylor-3_S11_R1 
##                        1                        1                        1 
##          Taylor-4_S12_R1        Thetford-1_S29_R1        Thetford-2_S30_R1 
##                        1                        1                        1 
##        Thetford-3_S31_R1        Thetford-4_S32_R1 
##                        1                        1
# Pairwise Bray-Curtis dissimilarity between all samples (family level), in long
# format, with the reef name of both samples of each pair attached.
# NOTE(review): reef names are looked up in sample_data(megan_genus_clr) while the
# abundances derive from megan_genus_abundant_known_phyla_only - this assumes both
# objects carry the same sample names; verify.
bray_curtis_family <- vegdist(t(FAMILY_RA), # transposed so that samples are the rows being compared
                              method = "bray", # I am computing Bray Curtis dissimilarity
                              diag = FALSE,
                              upper = TRUE) %>%
  # as.matrix() expands the dist object to the full symmetric matrix with a zero
  # diagonal; the diag/upper settings above do not restrict which pairs are kept
  as.matrix() %>%
  reshape2::melt() %>% # Long format (Var1, Var2, value) - needed for visualisation
  left_join(data.frame(sample_data(megan_genus_clr)) %>%
              rownames_to_column("Var1")) %>% # metadata of the first sample of the pair
  # Keep only the columns of interest; the reef name belongs to the first sample
  dplyr::select(Var1, Var2, value, REEF_NAME_for_Var1 = REEF_NAME) %>%
  left_join(data.frame(sample_data(megan_genus_clr)) %>%
              rownames_to_column("Var2")) %>% # metadata of the second sample of the pair
  # Keep only the columns of interest; this reef name belongs to the second sample
  dplyr::select(Var1, Var2, value, REEF_NAME_for_Var1, REEF_NAME_for_Var2 = REEF_NAME)
## Joining with `by = join_by(Var1)`
## Joining with `by = join_by(Var2)`
# Keep only within-site replicate pairs and turn dissimilarity into similarity.
# NOTE(review): the long table is melted from a full symmetric matrix, so every
# replicate pair is present twice ((a, b) and (b, a)) - confirm this is intended
# for the summary statistics and for the jittered points in the plot.
bray_curtis_family <- bray_curtis_family %>%
  # Same reef name on both sides of the pair = replicates of the same site
  dplyr::filter(REEF_NAME_for_Var1 == REEF_NAME_for_Var2) %>%
  # Drop self-comparisons (the matrix diagonal) by sample identity rather than by
  # testing `value != 0`, so two distinct replicates that happen to have a
  # dissimilarity of exactly 0 are not discarded. Rows with a missing
  # dissimilarity are still dropped, as before.
  dplyr::filter(as.character(Var1) != as.character(Var2), !is.na(value)) %>%
  # Bray-Curtis similarity = 1 - Bray-Curtis dissimilarity
  mutate(Bray_Curtis_similarity = 1 - value) %>%
  # Add the first two metadata columns (reef name and sampling trip according to
  # the original note; selected by position, so this relies on the column order),
  # matched on the sample ID held in Var1
  left_join(data.frame(sample_data(megan_genus_clr))[, c(1, 2)] %>%
              rownames_to_column("Var1"))
## Joining with `by = join_by(Var1)`
# Summary statistics of the within-site Bray-Curtis similarities (family level),
# each rounded to two decimals
bray_curtis_family_median <- bray_curtis_family$Bray_Curtis_similarity %>%
  median() %>%
  round(digits = 2)
bray_curtis_family_mean <- bray_curtis_family$Bray_Curtis_similarity %>%
  mean() %>%
  round(digits = 2)
bray_curtis_family_SD <- bray_curtis_family$Bray_Curtis_similarity %>%
  sd() %>%
  round(digits = 2)
bray_curtis_family_minimum <- bray_curtis_family$Bray_Curtis_similarity %>%
  min() %>%
  round(digits = 2)

# Plotting as boxplots:
# One box of within-site Bray-Curtis similarity (family level). Replicate pairs are
# overlaid as jittered points coloured by sampling trip, the mean is drawn as a
# green dot and the summary statistics computed above are printed as text.
bray_curtis_family_boxplots <- bray_curtis_family %>%
  ggplot(aes(x = "Within replicate for taxa (family level)",
             y = Bray_Curtis_similarity)) +
  # Outliers are hidden in the box layer because the jitter layer draws every point
  geom_boxplot(show.legend = FALSE, outlier.shape = NA) +
  geom_jitter(aes(color = Sampling_trip), size = 0.4, alpha = 0.2) +
  stat_summary(aes(label = paste("Median:", bray_curtis_family_median,
                                 "\nMean:", bray_curtis_family_mean,
                                 "\nSD:", bray_curtis_family_SD,
                                 "\nMin:", bray_curtis_family_minimum)),
               fun = median, # `fun.y` is deprecated since ggplot2 3.3.0; `fun` replaces it
               geom = "text",
               color = "black") + # median, mean, SD and minimum as text, placed at the median
  stat_summary(fun = mean,
               geom = "point",
               shape = 20,
               size = 1.5,
               color = "seagreen1",
               fill = "seagreen1") + # Plotting the mean as a green dot!
  scale_color_manual(values = c("indianred", # Sampling trip 1
                                "indianred4", # Sampling trip 2
                                "red3", # Sampling trip 3
                                "slateblue") # Sampling trip 4
  ) +
  ylim(0, 1) + # Fixing the y axis to the full 0-1 range of Bray-Curtis similarity
  ylab("Bray-Curtis similarity") +
  theme(legend.position = "none")
# Taxa - Order level
# Attach the taxonomy table to the OTU table (matched on the OTU identifier taken
# from the row names), then move the identifier back into the row names
ORDER <- otu_table(megan_genus_abundant_known_phyla_only) %>%
  as.data.frame() %>%
  rownames_to_column("OTU") %>%
  left_join(
    tax_table(megan_genus_abundant_known_phyla_only) %>%
      as.data.frame() %>%
      rownames_to_column("OTU")
  ) %>%
  column_to_rownames("OTU")
## Joining with `by = join_by(OTU)`
# Now summarising raw counts at order level: every numeric column is summed
# within each Order, and the Order names become the row names
ORDER <- ddply(ORDER, "Order", numcolwise(sum)) %>%
  column_to_rownames("Order")

# Ready to compute relative abundances per sample - by dividing cell value with column sum
ORDER_RA <- ORDER
# seq_len() instead of `1:ncol()`, which would iterate over c(1, 0) for a table
# without columns
for (i in seq_len(ncol(ORDER))) {
  ORDER_RA[i] <- ORDER_RA[i] / sum(ORDER_RA[i])
}
# Checking that rel abunds sum up to 1:
colSums(ORDER_RA)
##          11-049-1_S89_R1          11-049-2_S90_R1          11-049-3_S91_R1 
##                        1                        1                        1 
##          11-049-4_S92_R1          11-162-1_S81_R1          11-162-2_S82_R1 
##                        1                        1                        1 
##          11-162-3_S83_R1          11-162-4_S84_R1           13-124-1_S9_R1 
##                        1                        1                        1 
##          13-124-2_S10_R1          13-124-3_S11_R1          13-124-4_S12_R1 
##                        1                        1                        1 
##          21-550-1_S69_R1          21-550-2_S70_R1          21-550-3_S71_R1 
##                        1                        1                        1 
##          21-550-4_S72_R1          21-580-1_S57_R1          21-580-2_S58_R1 
##                        1                        1                        1 
##          21-580-3_S59_R1          21-580-4_S60_R1          22-084-1_S41_R1 
##                        1                        1                        1 
##          22-084-2_S42_R1          22-084-3_S43_R1          22-084-4_S44_R1 
##                        1                        1                        1 
##      Agincourt1-1_S33_R1      Agincourt1-2_S34_R1      Agincourt1-3_S35_R1 
##                        1                        1                        1 
##      Agincourt1-4_S36_R1       Arlington-1_S37_R1       Arlington-2_S38_R1 
##                        1                        1                        1 
##       Arlington-3_S39_R1       Arlington-4_S40_R1           Boult-1_S25_R1 
##                        1                        1                        1 
##           Boult-2_S26_R1           Boult-3_S27_R1           Boult-4_S28_R1 
##                        1                        1                        1 
##      Broomfield-1_S49_R1      Broomfield-3_S51_R1      Broomfield-4_S52_R1 
##                        1                        1                        1 
## Broomfield-rpt-2_S115_R1       Centipede-1_S57_R1       Centipede-2_S58_R1 
##                        1                        1                        1 
##       Centipede-3_S59_R1       Centipede-4_S60_R1         Chicken-1_S69_R1 
##                        1                        1                        1 
##         Chicken-2_S70_R1         Chicken-3_S71_R1         Chicken-4_S72_R1 
##                        1                        1                        1 
##        Chinaman-1_S65_R1        Chinaman-2_S66_R1        Chinaman-3_S67_R1 
##                        1                        1                        1 
##        Chinaman-4_S68_R1         Corbett-1_S17_R1         Corbett-2_S18_R1 
##                        1                        1                        1 
##         Corbett-3_S19_R1         Corbett-4_S20_R1            Davie-1_S1_R1 
##                        1                        1                        1 
##            Davie-2_S2_R1            Davie-3_S3_R1            Davie-4_S4_R1 
##                        1                        1                        1 
##         Erskine-1_S61_R1         Erskine-2_S62_R1         Erskine-3_S63_R1 
##                        1                        1                        1 
##         Erskine-4_S64_R1         Fairfax-1_S33_R1         Fairfax-2_S34_R1 
##                        1                        1                        1 
##         Fairfax-3_S35_R1         Fairfax-4_S36_R1     Farquaharson-1_S1_R1 
##                        1                        1                        1 
##     Farquaharson-2_S2_R1     Farquaharson-3_S3_R1     Farquaharson-4_S4_R1 
##                        1                        1                        1 
##          Feather-1_S5_R1          Feather-2_S6_R1          Feather-3_S7_R1 
##                        1                        1                        1 
##          Feather-4_S8_R1    Fore-and-Aft-1_S77_R1    Fore-and-Aft-2_S78_R1 
##                        1                        1                        1 
##    Fore-and-Aft-3_S79_R1    Fore-and-Aft-4_S80_R1            Fork-1_S49_R1 
##                        1                        1                        1 
##            Fork-2_S50_R1            Fork-3_S51_R1            Fork-4_S52_R1 
##                        1                        1                        1 
##            Grub-1_S65_R1            Grub-2_S66_R1            Grub-3_S67_R1 
##                        1                        1                        1 
##            Grub-4_S68_R1        Hastings-1_S41_R1        Hastings-2_S42_R1 
##                        1                        1                        1 
##        Hastings-3_S43_R1        Hastings-4_S44_R1          Hedley-1_S21_R1 
##                        1                        1                        1 
##          Hedley-2_S22_R1          Hedley-3_S23_R1           Helix-1_S61_R1 
##                        1                        1                        1 
##           Helix-2_S62_R1           Helix-3_S63_R1           Helix-4_S64_R1 
##                        1                        1                        1 
##          Hoskyn-1_S29_R1          Hoskyn-2_S30_R1          Hoskyn-3_S31_R1 
##                        1                        1                        1 
##          Hoskyn-4_S32_R1      JohnBrewer-1_S93_R1      JohnBrewer-2_S94_R1 
##                        1                        1                        1 
##      JohnBrewer-3_S97_R1      JohnBrewer-4_S98_R1           Kelso-1_S85_R1 
##                        1                        1                        1 
##           Kelso-2_S86_R1           Kelso-3_S87_R1           Kelso-4_S88_R1 
##                        1                        1                        1 
##           Knife-1_S45_R1           Knife-2_S46_R1           Knife-3_S47_R1 
##                        1                        1                        1 
##           Knife-4_S48_R1          Lagoon-1_S13_R1          Lagoon-2_S14_R1 
##                        1                        1                        1 
##          Lagoon-3_S15_R1          Lagoon-4_S16_R1     LittleKelso-1_S81_R1 
##                        1                        1                        1 
##     LittleKelso-2_S82_R1     LittleKelso-3_S83_R1     LittleKelso-4_S84_R1 
##                        1                        1                        1 
##          Lynchs-1_S99_R1         Lynchs-2_S100_R1         Lynchs-3_S101_R1 
##                        1                        1                        1 
##         Lynchs-4_S102_R1          Mantis-1_S85_R1          Mantis-2_S86_R1 
##                        1                        1                        1 
##          Mantis-3_S87_R1          Mantis-4_S88_R1        Masthead-1_S53_R1 
##                        1                        1                        1 
##        Masthead-2_S54_R1        Masthead-3_S55_R1        Masthead-4_S56_R1 
##                        1                        1                        1 
##       McCulloch-1_S17_R1       McCulloch-2_S18_R1       McCulloch-3_S19_R1 
##                        1                        1                        1 
##       McCulloch-4_S20_R1        McSweeney-1_S5_R1        McSweeney-2_S6_R1 
##                        1                        1                        1 
##        McSweeney-3_S7_R1        McSweeney-4_S8_R1         Monsoon-1_S21_R1 
##                        1                        1                        1 
##         Monsoon-2_S22_R1         Monsoon-3_S23_R1         Monsoon-4_S24_R1 
##                        1                        1                        1 
##           Moore-1_S25_R1           Moore-2_S26_R1           Moore-3_S27_R1 
##                        1                        1                        1 
##           Moore-4_S28_R1        Myrmidon-1_S53_R1        Myrmidon-2_S54_R1 
##                        1                        1                        1 
##        Myrmidon-3_S55_R1        Myrmidon-4_S56_R1           North-1_S37_R1 
##                        1                        1                        1 
##           North-2_S38_R1           North-3_S39_R1           North-4_S40_R1 
##                        1                        1                        1 
##           Peart-1_S13_R1           Peart-2_S14_R1           Peart-3_S15_R1 
##                        1                        1                        1 
##           Peart-4_S16_R1             Rib-1_S73_R1             Rib-2_S74_R1 
##                        1                        1                        1 
##             Rib-3_S75_R1             Rib-4_S76_R1        Roxburgh-1_S89_R1 
##                        1                        1                        1 
##        Roxburgh-2_S90_R1        Roxburgh-3_S91_R1        Roxburgh-4_S92_R1 
##                        1                        1                        1 
##        Sanbank1-1_S77_R1        Sanbank1-2_S78_R1        Sanbank1-3_S79_R1 
##                        1                        1                        1 
##        Sanbank1-4_S80_R1     SmallLagoon-1_S45_R1     SmallLagoon-2_S46_R1 
##                        1                        1                        1 
##     SmallLagoon-3_S47_R1     SmallLagoon-4_S48_R1      St-Crispin-1_S73_R1 
##                        1                        1                        1 
##      St-Crispin-2_S74_R1      St-Crispin-3_S75_R1      St-Crispin-4_S76_R1 
##                        1                        1                        1 
##           Taylor-1_S9_R1          Taylor-2_S10_R1          Taylor-3_S11_R1 
##                        1                        1                        1 
##          Taylor-4_S12_R1        Thetford-1_S29_R1        Thetford-2_S30_R1 
##                        1                        1                        1 
##        Thetford-3_S31_R1        Thetford-4_S32_R1 
##                        1                        1
# Pairwise Bray-Curtis dissimilarity between all samples (order level), in long
# format, with the reef name of both samples of each pair attached.
# NOTE(review): reef names are looked up in sample_data(megan_genus_clr) while the
# abundances derive from megan_genus_abundant_known_phyla_only - this assumes both
# objects carry the same sample names; verify.
bray_curtis_order <- vegdist(t(ORDER_RA), # transposed so that samples are the rows being compared
                             method = "bray", # I am computing Bray Curtis dissimilarity
                             diag = FALSE,
                             upper = TRUE) %>%
  # as.matrix() expands the dist object to the full symmetric matrix with a zero
  # diagonal; the diag/upper settings above do not restrict which pairs are kept
  as.matrix() %>%
  reshape2::melt() %>% # Long format (Var1, Var2, value) - needed for visualisation
  left_join(data.frame(sample_data(megan_genus_clr)) %>%
              rownames_to_column("Var1")) %>% # metadata of the first sample of the pair
  # Keep only the columns of interest; the reef name belongs to the first sample
  dplyr::select(Var1, Var2, value, REEF_NAME_for_Var1 = REEF_NAME) %>%
  left_join(data.frame(sample_data(megan_genus_clr)) %>%
              rownames_to_column("Var2")) %>% # metadata of the second sample of the pair
  # Keep only the columns of interest; this reef name belongs to the second sample
  dplyr::select(Var1, Var2, value, REEF_NAME_for_Var1, REEF_NAME_for_Var2 = REEF_NAME)
## Joining with `by = join_by(Var1)`
## Joining with `by = join_by(Var2)`
# Keep only within-site replicate pairs and turn dissimilarity into similarity.
# NOTE(review): the long table is melted from a full symmetric matrix, so every
# replicate pair is present twice ((a, b) and (b, a)) - confirm this is intended
# for the summary statistics and for the jittered points in the plot.
bray_curtis_order <- bray_curtis_order %>%
  # Same reef name on both sides of the pair = replicates of the same site
  dplyr::filter(REEF_NAME_for_Var1 == REEF_NAME_for_Var2) %>%
  # Drop self-comparisons (the matrix diagonal) by sample identity rather than by
  # testing `value != 0`, so two distinct replicates that happen to have a
  # dissimilarity of exactly 0 are not discarded. Rows with a missing
  # dissimilarity are still dropped, as before.
  dplyr::filter(as.character(Var1) != as.character(Var2), !is.na(value)) %>%
  # Bray-Curtis similarity = 1 - Bray-Curtis dissimilarity
  mutate(Bray_Curtis_similarity = 1 - value) %>%
  # Add the first two metadata columns (reef name and sampling trip according to
  # the original note; selected by position, so this relies on the column order),
  # matched on the sample ID held in Var1
  left_join(data.frame(sample_data(megan_genus_clr))[, c(1, 2)] %>%
              rownames_to_column("Var1"))
## Joining with `by = join_by(Var1)`
# Summary statistics of the within-site Bray-Curtis similarities (order level),
# each rounded to two decimals
bray_curtis_order_median <- bray_curtis_order$Bray_Curtis_similarity %>%
  median() %>%
  round(digits = 2)
bray_curtis_order_mean <- bray_curtis_order$Bray_Curtis_similarity %>%
  mean() %>%
  round(digits = 2)
bray_curtis_order_SD <- bray_curtis_order$Bray_Curtis_similarity %>%
  sd() %>%
  round(digits = 2)
bray_curtis_order_minimum <- bray_curtis_order$Bray_Curtis_similarity %>%
  min() %>%
  round(digits = 2)

# Plotting as boxplots:
# One box of within-site Bray-Curtis similarity (order level). Replicate pairs are
# overlaid as jittered points coloured by sampling trip, the mean is drawn as a
# green dot and the summary statistics computed above are printed as text.
bray_curtis_order_boxplots <- bray_curtis_order %>%
  ggplot(aes(x = "Within replicate for taxa (Order level)",
             y = Bray_Curtis_similarity)) +
  # Outliers are hidden in the box layer because the jitter layer draws every point
  geom_boxplot(show.legend = FALSE, outlier.shape = NA) +
  geom_jitter(aes(color = Sampling_trip), size = 0.4, alpha = 0.2) +
  stat_summary(aes(label = paste("Median:", bray_curtis_order_median,
                                 "\nMean:", bray_curtis_order_mean,
                                 "\nSD:", bray_curtis_order_SD,
                                 "\nMin:", bray_curtis_order_minimum)),
               fun = median, # `fun.y` is deprecated since ggplot2 3.3.0; `fun` replaces it
               geom = "text",
               color = "black") + # median, mean, SD and minimum as text, placed at the median
  stat_summary(fun = mean,
               geom = "point",
               shape = 20,
               size = 1.5,
               color = "seagreen1",
               fill = "seagreen1") + # Plotting the mean as a green dot!
  scale_color_manual(values = c("indianred", # Sampling trip 1
                                "indianred4", # Sampling trip 2
                                "red3", # Sampling trip 3
                                "slateblue") # Sampling trip 4
  ) +
  ylim(0, 1) + # Fixing the y axis to the full 0-1 range of Bray-Curtis similarity
  ylab("Bray-Curtis similarity") +
  theme(legend.position = "none")
# Taxa - Class level
# Attach the taxonomy table to the OTU table (matched on the OTU identifier taken
# from the row names), then move the identifier back into the row names
CLASS <- otu_table(megan_genus_abundant_known_phyla_only) %>%
  as.data.frame() %>%
  rownames_to_column("OTU") %>%
  left_join(
    tax_table(megan_genus_abundant_known_phyla_only) %>%
      as.data.frame() %>%
      rownames_to_column("OTU")
  ) %>%
  column_to_rownames("OTU")
## Joining with `by = join_by(OTU)`
# Now summarising raw counts at class level
CLASS <- ddply(CLASS, "Class", numcolwise(sum)) %>% 
  column_to_rownames("Class")

# Ready to compute relative abundances per sample - by dividing each cell value
# by its column (sample) sum. All columns are processed: the Class names were
# moved into the row names, so no non-numeric column is left to skip.
CLASS_RA <- CLASS
# `[] <-` keeps the data frame structure (row names = classes, columns = samples)
CLASS_RA[] <- lapply(CLASS, function(counts) counts / sum(counts))
# Checking that rel abunds sum up to 1:
colSums(CLASS_RA)
##          11-049-1_S89_R1          11-049-2_S90_R1          11-049-3_S91_R1 
##                        1                        1                        1 
##          11-049-4_S92_R1          11-162-1_S81_R1          11-162-2_S82_R1 
##                        1                        1                        1 
##          11-162-3_S83_R1          11-162-4_S84_R1           13-124-1_S9_R1 
##                        1                        1                        1 
##          13-124-2_S10_R1          13-124-3_S11_R1          13-124-4_S12_R1 
##                        1                        1                        1 
##          21-550-1_S69_R1          21-550-2_S70_R1          21-550-3_S71_R1 
##                        1                        1                        1 
##          21-550-4_S72_R1          21-580-1_S57_R1          21-580-2_S58_R1 
##                        1                        1                        1 
##          21-580-3_S59_R1          21-580-4_S60_R1          22-084-1_S41_R1 
##                        1                        1                        1 
##          22-084-2_S42_R1          22-084-3_S43_R1          22-084-4_S44_R1 
##                        1                        1                        1 
##      Agincourt1-1_S33_R1      Agincourt1-2_S34_R1      Agincourt1-3_S35_R1 
##                        1                        1                        1 
##      Agincourt1-4_S36_R1       Arlington-1_S37_R1       Arlington-2_S38_R1 
##                        1                        1                        1 
##       Arlington-3_S39_R1       Arlington-4_S40_R1           Boult-1_S25_R1 
##                        1                        1                        1 
##           Boult-2_S26_R1           Boult-3_S27_R1           Boult-4_S28_R1 
##                        1                        1                        1 
##      Broomfield-1_S49_R1      Broomfield-3_S51_R1      Broomfield-4_S52_R1 
##                        1                        1                        1 
## Broomfield-rpt-2_S115_R1       Centipede-1_S57_R1       Centipede-2_S58_R1 
##                        1                        1                        1 
##       Centipede-3_S59_R1       Centipede-4_S60_R1         Chicken-1_S69_R1 
##                        1                        1                        1 
##         Chicken-2_S70_R1         Chicken-3_S71_R1         Chicken-4_S72_R1 
##                        1                        1                        1 
##        Chinaman-1_S65_R1        Chinaman-2_S66_R1        Chinaman-3_S67_R1 
##                        1                        1                        1 
##        Chinaman-4_S68_R1         Corbett-1_S17_R1         Corbett-2_S18_R1 
##                        1                        1                        1 
##         Corbett-3_S19_R1         Corbett-4_S20_R1            Davie-1_S1_R1 
##                        1                        1                        1 
##            Davie-2_S2_R1            Davie-3_S3_R1            Davie-4_S4_R1 
##                        1                        1                        1 
##         Erskine-1_S61_R1         Erskine-2_S62_R1         Erskine-3_S63_R1 
##                        1                        1                        1 
##         Erskine-4_S64_R1         Fairfax-1_S33_R1         Fairfax-2_S34_R1 
##                        1                        1                        1 
##         Fairfax-3_S35_R1         Fairfax-4_S36_R1     Farquaharson-1_S1_R1 
##                        1                        1                        1 
##     Farquaharson-2_S2_R1     Farquaharson-3_S3_R1     Farquaharson-4_S4_R1 
##                        1                        1                        1 
##          Feather-1_S5_R1          Feather-2_S6_R1          Feather-3_S7_R1 
##                        1                        1                        1 
##          Feather-4_S8_R1    Fore-and-Aft-1_S77_R1    Fore-and-Aft-2_S78_R1 
##                        1                        1                        1 
##    Fore-and-Aft-3_S79_R1    Fore-and-Aft-4_S80_R1            Fork-1_S49_R1 
##                        1                        1                        1 
##            Fork-2_S50_R1            Fork-3_S51_R1            Fork-4_S52_R1 
##                        1                        1                        1 
##            Grub-1_S65_R1            Grub-2_S66_R1            Grub-3_S67_R1 
##                        1                        1                        1 
##            Grub-4_S68_R1        Hastings-1_S41_R1        Hastings-2_S42_R1 
##                        1                        1                        1 
##        Hastings-3_S43_R1        Hastings-4_S44_R1          Hedley-1_S21_R1 
##                        1                        1                        1 
##          Hedley-2_S22_R1          Hedley-3_S23_R1           Helix-1_S61_R1 
##                        1                        1                        1 
##           Helix-2_S62_R1           Helix-3_S63_R1           Helix-4_S64_R1 
##                        1                        1                        1 
##          Hoskyn-1_S29_R1          Hoskyn-2_S30_R1          Hoskyn-3_S31_R1 
##                        1                        1                        1 
##          Hoskyn-4_S32_R1      JohnBrewer-1_S93_R1      JohnBrewer-2_S94_R1 
##                        1                        1                        1 
##      JohnBrewer-3_S97_R1      JohnBrewer-4_S98_R1           Kelso-1_S85_R1 
##                        1                        1                        1 
##           Kelso-2_S86_R1           Kelso-3_S87_R1           Kelso-4_S88_R1 
##                        1                        1                        1 
##           Knife-1_S45_R1           Knife-2_S46_R1           Knife-3_S47_R1 
##                        1                        1                        1 
##           Knife-4_S48_R1          Lagoon-1_S13_R1          Lagoon-2_S14_R1 
##                        1                        1                        1 
##          Lagoon-3_S15_R1          Lagoon-4_S16_R1     LittleKelso-1_S81_R1 
##                        1                        1                        1 
##     LittleKelso-2_S82_R1     LittleKelso-3_S83_R1     LittleKelso-4_S84_R1 
##                        1                        1                        1 
##          Lynchs-1_S99_R1         Lynchs-2_S100_R1         Lynchs-3_S101_R1 
##                        1                        1                        1 
##         Lynchs-4_S102_R1          Mantis-1_S85_R1          Mantis-2_S86_R1 
##                        1                        1                        1 
##          Mantis-3_S87_R1          Mantis-4_S88_R1        Masthead-1_S53_R1 
##                        1                        1                        1 
##        Masthead-2_S54_R1        Masthead-3_S55_R1        Masthead-4_S56_R1 
##                        1                        1                        1 
##       McCulloch-1_S17_R1       McCulloch-2_S18_R1       McCulloch-3_S19_R1 
##                        1                        1                        1 
##       McCulloch-4_S20_R1        McSweeney-1_S5_R1        McSweeney-2_S6_R1 
##                        1                        1                        1 
##        McSweeney-3_S7_R1        McSweeney-4_S8_R1         Monsoon-1_S21_R1 
##                        1                        1                        1 
##         Monsoon-2_S22_R1         Monsoon-3_S23_R1         Monsoon-4_S24_R1 
##                        1                        1                        1 
##           Moore-1_S25_R1           Moore-2_S26_R1           Moore-3_S27_R1 
##                        1                        1                        1 
##           Moore-4_S28_R1        Myrmidon-1_S53_R1        Myrmidon-2_S54_R1 
##                        1                        1                        1 
##        Myrmidon-3_S55_R1        Myrmidon-4_S56_R1           North-1_S37_R1 
##                        1                        1                        1 
##           North-2_S38_R1           North-3_S39_R1           North-4_S40_R1 
##                        1                        1                        1 
##           Peart-1_S13_R1           Peart-2_S14_R1           Peart-3_S15_R1 
##                        1                        1                        1 
##           Peart-4_S16_R1             Rib-1_S73_R1             Rib-2_S74_R1 
##                        1                        1                        1 
##             Rib-3_S75_R1             Rib-4_S76_R1        Roxburgh-1_S89_R1 
##                        1                        1                        1 
##        Roxburgh-2_S90_R1        Roxburgh-3_S91_R1        Roxburgh-4_S92_R1 
##                        1                        1                        1 
##        Sanbank1-1_S77_R1        Sanbank1-2_S78_R1        Sanbank1-3_S79_R1 
##                        1                        1                        1 
##        Sanbank1-4_S80_R1     SmallLagoon-1_S45_R1     SmallLagoon-2_S46_R1 
##                        1                        1                        1 
##     SmallLagoon-3_S47_R1     SmallLagoon-4_S48_R1      St-Crispin-1_S73_R1 
##                        1                        1                        1 
##      St-Crispin-2_S74_R1      St-Crispin-3_S75_R1      St-Crispin-4_S76_R1 
##                        1                        1                        1 
##           Taylor-1_S9_R1          Taylor-2_S10_R1          Taylor-3_S11_R1 
##                        1                        1                        1 
##          Taylor-4_S12_R1        Thetford-1_S29_R1        Thetford-2_S30_R1 
##                        1                        1                        1 
##        Thetford-3_S31_R1        Thetford-4_S32_R1 
##                        1                        1
# Pairwise Bray-Curtis dissimilarity between all samples (Class level), in long
# format, annotated with the reef name of both samples of every pair.
# NOTE: vegdist()'s `diag`/`upper` arguments only control how the "dist" object
# is printed - as.matrix() always returns the full symmetric matrix. The
# diagonal and the lower triangle are therefore blanked and dropped while
# melting, so that every pair of samples is kept exactly once (keeping both A-B
# and B-A doubles every point in the plot and inflates the sample size behind
# the SD) and self-comparisons are removed by position instead of by testing
# for a dissimilarity of exactly 0.
bray_curtis_class <- vegdist(t(CLASS_RA), # needs transposing
                             method = "bray") %>% # I am computing Bray Curtis dissimilarity
  as.matrix() %>% # Output as (full, symmetric) matrix
  replace(., lower.tri(., diag = TRUE), NA) %>% # Keeping only the upper triangle, without the diagonal
  reshape2::melt(na.rm = TRUE) %>% # Long format (one row per pair) - needed for visualisation; blanked cells are dropped
  left_join(., data.frame(sample_data(megan_genus_clr)) %>%
              rownames_to_column("Var1"),
            by = "Var1") %>% # adding reef names for the first sample of each pair
  dplyr::select(c("Var1", "Var2", "value", "REEF_NAME")) %>% # Selecting only columns of interest
  dplyr::rename(., REEF_NAME_for_Var1 = REEF_NAME) %>% # Reef names here correspond to the first sample of each pair
  left_join(., data.frame(sample_data(megan_genus_clr)) %>%
              rownames_to_column("Var2"),
            by = "Var2") %>% # Now merging based on Var2
  dplyr::select(c("Var1", "Var2", "value", "REEF_NAME_for_Var1", "REEF_NAME")) %>% # Selecting only columns of interest
  dplyr::rename(., REEF_NAME_for_Var2 = REEF_NAME) # Reef names here correspond to the second sample of each pair

# Within-reef (replicate level) comparisons only
bray_curtis_class <- bray_curtis_class %>%
  dplyr::filter(REEF_NAME_for_Var1 == REEF_NAME_for_Var2) %>% # Only pairs where both samples come from the same reef
  mutate(Bray_Curtis_similarity = 1 - value) %>% # computing Bray Curtis similarity as 1 - BC dissimilarity
  left_join(., data.frame(sample_data(megan_genus_clr))[, c(1, 2)] %>% # Only getting columns Reef name and Sampling trip (selected by position)
              rownames_to_column("Var1"),
            by = "Var1")
bray_curtis_class_median <- round(median(bray_curtis_class$Bray_Curtis_similarity), digits = 2)
bray_curtis_class_SD <- round(sd(bray_curtis_class$Bray_Curtis_similarity), digits = 2)
bray_curtis_class_minimum <- round(min(bray_curtis_class$Bray_Curtis_similarity), digits = 2)

# Plotting as boxplots: a single box for all within-reef comparisons, with the
# individual pairwise values overlaid and coloured by sampling trip
bray_curtis_class_boxplots <- bray_curtis_class %>%
  ggplot(aes(x = "Within replicate for taxa (Class level)",
             y = Bray_Curtis_similarity)) +
  # Outliers are hidden in the boxplot because geom_jitter() draws every point
  geom_boxplot(show.legend = FALSE, outlier.shape = NA) +
  geom_jitter(aes(color = Sampling_trip), size = 0.4, alpha = 0.2) +
  # Summary statistics as a text label, positioned at the median.
  # `fun` is used instead of `fun.y`, which is deprecated since ggplot2 3.3.0
  stat_summary(aes(label = paste("Median:", bray_curtis_class_median,
                                 "\nSD:", bray_curtis_class_SD,
                                 "\nMin:", bray_curtis_class_minimum)),
               fun = median,
               geom = "text",
               color = "black") +
  stat_summary(fun = mean,
               geom = "point",
               shape = 20,
               size = 1.5,
               color = "seagreen1",
               fill = "seagreen1") + # Plotting the mean as a green dot!
  scale_color_manual(values = c("indianred", # Sampling trip 1
                                "indianred4", # Sampling trip 2
                                "red3", # Sampling trip 3
                                "slateblue") # Sampling trip 4
  ) +
  ylim(0, 1) + # Fixing the y axis to the full range of Bray-Curtis similarity
  ylab("Bray-Curtis similarity") +
  theme(legend.position = "none")
# Taxa - PHYLUM level
# Combining the abundance table with the taxonomy table, one row per taxon.
# The join key is given explicitly so that the join can never silently pick up
# an additional shared column name (and no "Joining with" message is emitted).
PHYLUM <- left_join(otu_table(megan_genus_abundant_known_phyla_only) %>%
                      as.data.frame() %>%
                      rownames_to_column("OTU"),
                    tax_table(megan_genus_abundant_known_phyla_only) %>%
                      as.data.frame() %>%
                      rownames_to_column("OTU"),
                    by = "OTU") %>%
  column_to_rownames("OTU")
# Now summarising raw counts at PHYLUM level: numcolwise(sum) sums every numeric
# (i.e. per-sample) column within each Phylum
PHYLUM <- ddply(PHYLUM, "Phylum", numcolwise(sum)) %>%
  column_to_rownames("Phylum")

# Ready to compute relative abundances per sample - by dividing each cell value
# by its column (sample) sum. All columns are processed: the Phylum names were
# moved into the row names, so no non-numeric column is left to skip.
PHYLUM_RA <- PHYLUM
# `[] <-` keeps the data frame structure (row names = phyla, columns = samples)
PHYLUM_RA[] <- lapply(PHYLUM, function(counts) counts / sum(counts))
# Checking that rel abunds sum up to 1:
colSums(PHYLUM_RA)
##          11-049-1_S89_R1          11-049-2_S90_R1          11-049-3_S91_R1 
##                        1                        1                        1 
##          11-049-4_S92_R1          11-162-1_S81_R1          11-162-2_S82_R1 
##                        1                        1                        1 
##          11-162-3_S83_R1          11-162-4_S84_R1           13-124-1_S9_R1 
##                        1                        1                        1 
##          13-124-2_S10_R1          13-124-3_S11_R1          13-124-4_S12_R1 
##                        1                        1                        1 
##          21-550-1_S69_R1          21-550-2_S70_R1          21-550-3_S71_R1 
##                        1                        1                        1 
##          21-550-4_S72_R1          21-580-1_S57_R1          21-580-2_S58_R1 
##                        1                        1                        1 
##          21-580-3_S59_R1          21-580-4_S60_R1          22-084-1_S41_R1 
##                        1                        1                        1 
##          22-084-2_S42_R1          22-084-3_S43_R1          22-084-4_S44_R1 
##                        1                        1                        1 
##      Agincourt1-1_S33_R1      Agincourt1-2_S34_R1      Agincourt1-3_S35_R1 
##                        1                        1                        1 
##      Agincourt1-4_S36_R1       Arlington-1_S37_R1       Arlington-2_S38_R1 
##                        1                        1                        1 
##       Arlington-3_S39_R1       Arlington-4_S40_R1           Boult-1_S25_R1 
##                        1                        1                        1 
##           Boult-2_S26_R1           Boult-3_S27_R1           Boult-4_S28_R1 
##                        1                        1                        1 
##      Broomfield-1_S49_R1      Broomfield-3_S51_R1      Broomfield-4_S52_R1 
##                        1                        1                        1 
## Broomfield-rpt-2_S115_R1       Centipede-1_S57_R1       Centipede-2_S58_R1 
##                        1                        1                        1 
##       Centipede-3_S59_R1       Centipede-4_S60_R1         Chicken-1_S69_R1 
##                        1                        1                        1 
##         Chicken-2_S70_R1         Chicken-3_S71_R1         Chicken-4_S72_R1 
##                        1                        1                        1 
##        Chinaman-1_S65_R1        Chinaman-2_S66_R1        Chinaman-3_S67_R1 
##                        1                        1                        1 
##        Chinaman-4_S68_R1         Corbett-1_S17_R1         Corbett-2_S18_R1 
##                        1                        1                        1 
##         Corbett-3_S19_R1         Corbett-4_S20_R1            Davie-1_S1_R1 
##                        1                        1                        1 
##            Davie-2_S2_R1            Davie-3_S3_R1            Davie-4_S4_R1 
##                        1                        1                        1 
##         Erskine-1_S61_R1         Erskine-2_S62_R1         Erskine-3_S63_R1 
##                        1                        1                        1 
##         Erskine-4_S64_R1         Fairfax-1_S33_R1         Fairfax-2_S34_R1 
##                        1                        1                        1 
##         Fairfax-3_S35_R1         Fairfax-4_S36_R1     Farquaharson-1_S1_R1 
##                        1                        1                        1 
##     Farquaharson-2_S2_R1     Farquaharson-3_S3_R1     Farquaharson-4_S4_R1 
##                        1                        1                        1 
##          Feather-1_S5_R1          Feather-2_S6_R1          Feather-3_S7_R1 
##                        1                        1                        1 
##          Feather-4_S8_R1    Fore-and-Aft-1_S77_R1    Fore-and-Aft-2_S78_R1 
##                        1                        1                        1 
##    Fore-and-Aft-3_S79_R1    Fore-and-Aft-4_S80_R1            Fork-1_S49_R1 
##                        1                        1                        1 
##            Fork-2_S50_R1            Fork-3_S51_R1            Fork-4_S52_R1 
##                        1                        1                        1 
##            Grub-1_S65_R1            Grub-2_S66_R1            Grub-3_S67_R1 
##                        1                        1                        1 
##            Grub-4_S68_R1        Hastings-1_S41_R1        Hastings-2_S42_R1 
##                        1                        1                        1 
##        Hastings-3_S43_R1        Hastings-4_S44_R1          Hedley-1_S21_R1 
##                        1                        1                        1 
##          Hedley-2_S22_R1          Hedley-3_S23_R1           Helix-1_S61_R1 
##                        1                        1                        1 
##           Helix-2_S62_R1           Helix-3_S63_R1           Helix-4_S64_R1 
##                        1                        1                        1 
##          Hoskyn-1_S29_R1          Hoskyn-2_S30_R1          Hoskyn-3_S31_R1 
##                        1                        1                        1 
##          Hoskyn-4_S32_R1      JohnBrewer-1_S93_R1      JohnBrewer-2_S94_R1 
##                        1                        1                        1 
##      JohnBrewer-3_S97_R1      JohnBrewer-4_S98_R1           Kelso-1_S85_R1 
##                        1                        1                        1 
##           Kelso-2_S86_R1           Kelso-3_S87_R1           Kelso-4_S88_R1 
##                        1                        1                        1 
##           Knife-1_S45_R1           Knife-2_S46_R1           Knife-3_S47_R1 
##                        1                        1                        1 
##           Knife-4_S48_R1          Lagoon-1_S13_R1          Lagoon-2_S14_R1 
##                        1                        1                        1 
##          Lagoon-3_S15_R1          Lagoon-4_S16_R1     LittleKelso-1_S81_R1 
##                        1                        1                        1 
##     LittleKelso-2_S82_R1     LittleKelso-3_S83_R1     LittleKelso-4_S84_R1 
##                        1                        1                        1 
##          Lynchs-1_S99_R1         Lynchs-2_S100_R1         Lynchs-3_S101_R1 
##                        1                        1                        1 
##         Lynchs-4_S102_R1          Mantis-1_S85_R1          Mantis-2_S86_R1 
##                        1                        1                        1 
##          Mantis-3_S87_R1          Mantis-4_S88_R1        Masthead-1_S53_R1 
##                        1                        1                        1 
##        Masthead-2_S54_R1        Masthead-3_S55_R1        Masthead-4_S56_R1 
##                        1                        1                        1 
##       McCulloch-1_S17_R1       McCulloch-2_S18_R1       McCulloch-3_S19_R1 
##                        1                        1                        1 
##       McCulloch-4_S20_R1        McSweeney-1_S5_R1        McSweeney-2_S6_R1 
##                        1                        1                        1 
##        McSweeney-3_S7_R1        McSweeney-4_S8_R1         Monsoon-1_S21_R1 
##                        1                        1                        1 
##         Monsoon-2_S22_R1         Monsoon-3_S23_R1         Monsoon-4_S24_R1 
##                        1                        1                        1 
##           Moore-1_S25_R1           Moore-2_S26_R1           Moore-3_S27_R1 
##                        1                        1                        1 
##           Moore-4_S28_R1        Myrmidon-1_S53_R1        Myrmidon-2_S54_R1 
##                        1                        1                        1 
##        Myrmidon-3_S55_R1        Myrmidon-4_S56_R1           North-1_S37_R1 
##                        1                        1                        1 
##           North-2_S38_R1           North-3_S39_R1           North-4_S40_R1 
##                        1                        1                        1 
##           Peart-1_S13_R1           Peart-2_S14_R1           Peart-3_S15_R1 
##                        1                        1                        1 
##           Peart-4_S16_R1             Rib-1_S73_R1             Rib-2_S74_R1 
##                        1                        1                        1 
##             Rib-3_S75_R1             Rib-4_S76_R1        Roxburgh-1_S89_R1 
##                        1                        1                        1 
##        Roxburgh-2_S90_R1        Roxburgh-3_S91_R1        Roxburgh-4_S92_R1 
##                        1                        1                        1 
##        Sanbank1-1_S77_R1        Sanbank1-2_S78_R1        Sanbank1-3_S79_R1 
##                        1                        1                        1 
##        Sanbank1-4_S80_R1     SmallLagoon-1_S45_R1     SmallLagoon-2_S46_R1 
##                        1                        1                        1 
##     SmallLagoon-3_S47_R1     SmallLagoon-4_S48_R1      St-Crispin-1_S73_R1 
##                        1                        1                        1 
##      St-Crispin-2_S74_R1      St-Crispin-3_S75_R1      St-Crispin-4_S76_R1 
##                        1                        1                        1 
##           Taylor-1_S9_R1          Taylor-2_S10_R1          Taylor-3_S11_R1 
##                        1                        1                        1 
##          Taylor-4_S12_R1        Thetford-1_S29_R1        Thetford-2_S30_R1 
##                        1                        1                        1 
##        Thetford-3_S31_R1        Thetford-4_S32_R1 
##                        1                        1
# Pairwise Bray-Curtis dissimilarity between all samples (Phylum level), in
# long format, annotated with the reef name of both samples of every pair.
# NOTE: vegdist()'s `diag`/`upper` arguments only control how the "dist" object
# is printed - as.matrix() always returns the full symmetric matrix. The
# diagonal and the lower triangle are therefore blanked and dropped while
# melting, so that every pair of samples is kept exactly once (keeping both A-B
# and B-A doubles every point in the plot and inflates the sample size behind
# the SD) and self-comparisons are removed by position instead of by testing
# for a dissimilarity of exactly 0.
bray_curtis_phylum <- vegdist(t(PHYLUM_RA), # needs transposing
                              method = "bray") %>% # I am computing Bray Curtis dissimilarity
  as.matrix() %>% # Output as (full, symmetric) matrix
  replace(., lower.tri(., diag = TRUE), NA) %>% # Keeping only the upper triangle, without the diagonal
  reshape2::melt(na.rm = TRUE) %>% # Long format (one row per pair) - needed for visualisation; blanked cells are dropped
  left_join(., data.frame(sample_data(megan_genus_clr)) %>%
              rownames_to_column("Var1"),
            by = "Var1") %>% # adding reef names for the first sample of each pair
  dplyr::select(c("Var1", "Var2", "value", "REEF_NAME")) %>% # Selecting only columns of interest
  dplyr::rename(., REEF_NAME_for_Var1 = REEF_NAME) %>% # Reef names here correspond to the first sample of each pair
  left_join(., data.frame(sample_data(megan_genus_clr)) %>%
              rownames_to_column("Var2"),
            by = "Var2") %>% # Now merging based on Var2
  dplyr::select(c("Var1", "Var2", "value", "REEF_NAME_for_Var1", "REEF_NAME")) %>% # Selecting only columns of interest
  dplyr::rename(., REEF_NAME_for_Var2 = REEF_NAME) # Reef names here correspond to the second sample of each pair

# Within-reef (replicate level) comparisons only
bray_curtis_phylum <- bray_curtis_phylum %>%
  dplyr::filter(REEF_NAME_for_Var1 == REEF_NAME_for_Var2) %>% # Only pairs where both samples come from the same reef
  mutate(Bray_Curtis_similarity = 1 - value) %>% # computing Bray Curtis similarity as 1 - BC dissimilarity
  left_join(., data.frame(sample_data(megan_genus_clr))[, c(1, 2)] %>% # Only getting columns Reef name and Sampling trip (selected by position)
              rownames_to_column("Var1"),
            by = "Var1")
bray_curtis_phylum_median <- round(median(bray_curtis_phylum$Bray_Curtis_similarity), digits = 2)
bray_curtis_phylum_SD <- round(sd(bray_curtis_phylum$Bray_Curtis_similarity), digits = 2)
bray_curtis_phylum_minimum <- round(min(bray_curtis_phylum$Bray_Curtis_similarity), digits = 2)

# Plotting as boxplots: a single box for all within-reef comparisons, with the
# individual pairwise values overlaid and coloured by sampling trip
bray_curtis_phylum_boxplots <- bray_curtis_phylum %>%
  ggplot(aes(x = "Within replicate for taxa (Phylum level)",
             y = Bray_Curtis_similarity)) +
  # Outliers are hidden in the boxplot because geom_jitter() draws every point
  geom_boxplot(show.legend = FALSE, outlier.shape = NA) +
  geom_jitter(aes(color = Sampling_trip), size = 0.4, alpha = 0.2) +
  # Summary statistics as a text label, positioned at the median.
  # `fun` is used instead of `fun.y`, which is deprecated since ggplot2 3.3.0
  stat_summary(aes(label = paste("Median:", bray_curtis_phylum_median,
                                 "\nSD:", bray_curtis_phylum_SD,
                                 "\nMin:", bray_curtis_phylum_minimum)),
               fun = median,
               geom = "text",
               color = "black") +
  stat_summary(fun = mean,
               geom = "point",
               shape = 20,
               size = 1.5,
               color = "seagreen1",
               fill = "seagreen1") + # Plotting the mean as a green dot!
  scale_color_manual(values = c("indianred", # Sampling trip 1
                                "indianred4", # Sampling trip 2
                                "red3", # Sampling trip 3
                                "slateblue") # Sampling trip 4
  ) +
  ylim(0, 1) + # Fixing the y axis to the full range of Bray-Curtis similarity
  ylab("Bray-Curtis similarity") +
  theme(legend.position = "none")
# GO terms - Rank 3
# Pairwise Bray-Curtis dissimilarity between all samples, in long format,
# annotated with the reef name of both samples of every pair.
# NOTE: vegdist()'s `diag`/`upper` arguments only control how the "dist" object
# is printed - as.matrix() always returns the full symmetric matrix. The
# diagonal and the lower triangle are therefore blanked and dropped while
# melting, so that every pair of samples is kept exactly once (keeping both A-B
# and B-A doubles every point in the plot and inflates the sample size behind
# the SD) and self-comparisons are removed by position instead of by testing
# for a dissimilarity of exactly 0.
bray_curtis_GO3 <- vegdist(megan_GO_3_RA_no_rare@otu_table %>% # Getting my table with relative abundances
                             t() %>% # needs to be transposed though
                             as.data.frame(), # df
                           method = "bray") %>% # I am computing Bray Curtis dissimilarity
  as.matrix() %>% # Output as (full, symmetric) matrix
  replace(., lower.tri(., diag = TRUE), NA) %>% # Keeping only the upper triangle, without the diagonal
  reshape2::melt(na.rm = TRUE) %>% # Long format (one row per pair) - needed for visualisation; blanked cells are dropped
  left_join(., data.frame(sample_data(megan_go_clr_3)) %>%
              rownames_to_column("Var1"),
            by = "Var1") %>% # adding reef names for the first sample of each pair
  dplyr::select(c("Var1", "Var2", "value", "REEF_NAME")) %>% # Selecting only columns of interest
  dplyr::rename(., REEF_NAME_for_Var1 = REEF_NAME) %>% # Reef names here correspond to the first sample of each pair
  left_join(., data.frame(sample_data(megan_go_clr_3)) %>%
              rownames_to_column("Var2"),
            by = "Var2") %>% # Now merging based on Var2
  dplyr::select(c("Var1", "Var2", "value", "REEF_NAME_for_Var1", "REEF_NAME")) %>% # Selecting only columns of interest
  dplyr::rename(., REEF_NAME_for_Var2 = REEF_NAME) # Reef names here correspond to the second sample of each pair

# Within-reef (replicate level) comparisons only
bray_curtis_GO3 <- bray_curtis_GO3 %>%
  dplyr::filter(REEF_NAME_for_Var1 == REEF_NAME_for_Var2) %>% # Only pairs where both samples come from the same reef
  mutate(Bray_Curtis_similarity = 1 - value) %>% # computing Bray Curtis similarity as 1 - BC dissimilarity
  left_join(., data.frame(sample_data(megan_go_clr_3))[, c(1, 2)] %>% # Only getting columns Reef name and Sampling trip (selected by position)
              rownames_to_column("Var1"),
            by = "Var1")
bray_curtis_GO3_median <- round(median(bray_curtis_GO3$Bray_Curtis_similarity), digits = 2)
bray_curtis_GO3_SD <- round(sd(bray_curtis_GO3$Bray_Curtis_similarity), digits = 2)
bray_curtis_GO3_minimum <- round(min(bray_curtis_GO3$Bray_Curtis_similarity), digits = 2)

# Plotting as boxplots: a single box for all within-reef comparisons, with the
# individual pairwise values overlaid and coloured by sampling trip
bray_curtis_GO3_boxplots <- bray_curtis_GO3 %>%
  ggplot(aes(x = "Within replicate for GO terms (Rank 3)",
             y = Bray_Curtis_similarity)) +
  # Outliers are hidden in the boxplot because geom_jitter() draws every point
  geom_boxplot(show.legend = FALSE, outlier.shape = NA) +
  geom_jitter(aes(color = Sampling_trip), size = 0.4, alpha = 0.2) +
  # Summary statistics as a text label, positioned at the median.
  # `fun` is used instead of `fun.y`, which is deprecated since ggplot2 3.3.0
  stat_summary(aes(label = paste("Median:", bray_curtis_GO3_median,
                                 "\nSD:", bray_curtis_GO3_SD,
                                 "\nMin:", bray_curtis_GO3_minimum)),
               fun = median,
               geom = "text",
               color = "black") +
  stat_summary(fun = mean,
               geom = "point",
               shape = 20,
               size = 1.5,
               color = "seagreen1",
               fill = "seagreen1") + # Plotting the mean as a green dot!
  scale_color_manual(values = c("indianred", # Sampling trip 1
                                "indianred4", # Sampling trip 2
                                "red3", # Sampling trip 3
                                "slateblue") # Sampling trip 4
  ) +
  ylim(0, 1) + # Fixing the y axis to the full range of Bray-Curtis similarity
  ylab("Bray-Curtis similarity") +
  theme(legend.position = "none")
# GO terms - Rank 4
# Pairwise Bray-Curtis dissimilarity between all samples (relative abundances),
# reshaped to long format and annotated with the reef name of both samples.
# NOTE: `diag` / `upper` do not subset the result. as.matrix() always returns
# the full symmetric matrix, so melt() yields every pair in both orders plus
# the zero diagonal (the diagonal is filtered out in the next step).
bray_curtis_GO4 <- vegdist(megan_GO_4_RA_no_rare@otu_table %>% # Table with relative abundances
                             t() %>% # transposed before computing distances (presumably so that samples are rows - confirm)
                             as.data.frame(),
                           method = "bray", # Bray-Curtis dissimilarity
                           diag = FALSE, # spelled out: `F` is an ordinary variable that can be reassigned
                           upper = TRUE) %>%
  as.matrix() %>% # full square matrix
  reshape2::melt() %>% # long format: one row per (Var1, Var2) pair
  left_join(data.frame(sample_data(megan_go_clr_4)) %>%
              rownames_to_column("Var1"),
            by = "Var1") %>% # explicit key: reef name of the first sample of each pair
  dplyr::select(c("Var1", "Var2", "value", "REEF_NAME")) %>%
  dplyr::rename(REEF_NAME_for_Var1 = REEF_NAME) %>%
  left_join(data.frame(sample_data(megan_go_clr_4)) %>%
              rownames_to_column("Var2"),
            by = "Var2") %>% # explicit key: reef name of the second sample of each pair
  dplyr::select(c("Var1", "Var2", "value", "REEF_NAME_for_Var1", "REEF_NAME")) %>%
  dplyr::rename(REEF_NAME_for_Var2 = REEF_NAME)
## Joining with `by = join_by(Var1)`
## Joining with `by = join_by(Var2)`
# Within-site replicate similarity for GO terms (Rank 4):
#  1. keep only pairs whose two samples share the same reef name,
#  2. drop zero dissimilarities (a replicate compared with itself),
#  3. convert Bray-Curtis dissimilarity to similarity (1 - dissimilarity),
#  4. attach the first two sample_data columns (Reef name and Sampling trip,
#     per the original note) via the shared sample-name column.
bray_curtis_GO4 <- bray_curtis_GO4 %>%
  dplyr::filter(REEF_NAME_for_Var1 == REEF_NAME_for_Var2) %>%
  dplyr::filter(value != 0) %>%
  mutate(Bray_Curtis_similarity = 1 - value) %>%
  left_join(
    rownames_to_column(
      data.frame(sample_data(megan_go_clr_4))[, c(1, 2)],
      "Var1"
    )
  )
## Joining with `by = join_by(Var1)`
# Summary statistics of the within-site similarities (rounded to 2 decimals);
# these are printed as the text label on the GO rank 4 boxplot.
bray_curtis_GO4_median <- round(median(bray_curtis_GO4[["Bray_Curtis_similarity"]]), 2)
bray_curtis_GO4_mean <- round(mean(bray_curtis_GO4[["Bray_Curtis_similarity"]]), 2)
bray_curtis_GO4_SD <- round(sd(bray_curtis_GO4[["Bray_Curtis_similarity"]]), 2)
bray_curtis_GO4_minimum <- round(min(bray_curtis_GO4[["Bray_Curtis_similarity"]]), 2)

# Plotting as boxplots:
# A single box for all within-site replicate pairs (GO terms, Rank 4).
# Jittered points are coloured by sampling trip, the text label reports the
# pre-computed median / mean / SD / minimum, and the green dot marks the mean.
bray_curtis_GO4_boxplots <- bray_curtis_GO4 %>%
  ggplot(aes(x = "Within replicate for GO terms (Rank 4)",
             y = Bray_Curtis_similarity)) +
  geom_boxplot(show.legend = FALSE, outlier.shape = NA) + # outliers hidden: every point is already drawn by geom_jitter()
  geom_jitter(aes(color = Sampling_trip), size = 0.4, alpha = 0.2) +
  stat_summary(aes(label = paste("Median:", bray_curtis_GO4_median,
                                 "\nMean:", bray_curtis_GO4_mean,
                                 "\nSD:", bray_curtis_GO4_SD,
                                 "\nMin:", bray_curtis_GO4_minimum)),
               fun = median, # `fun.y` is deprecated (ggplot2 >= 3.3.0); `fun` matches the mean layer below
               geom = "text",
               color = "black") + # summary statistics as text, anchored at the median
  stat_summary(fun = mean,
               geom = "point",
               shape = 20,
               size = 1.5,
               color = "seagreen1",
               fill = "seagreen1") + # Plotting the mean as a green dot!
  scale_color_manual(values = c("indianred", # Sampling trip 1
                                "indianred4", # Sampling trip 2
                                "red3", # Sampling trip 3
                                "slateblue") # Sampling trip 4
  ) +
  ylim(0, 1) + # Fixing the y axis to the full range of Bray-Curtis similarity
  ylab("Bray-Curtis similarity") +
  theme(legend.position = "none")
# GO terms - Rank 5
# Pairwise Bray-Curtis dissimilarity between all samples (relative abundances),
# reshaped to long format and annotated with the reef name of both samples.
# NOTE: `diag` / `upper` do not subset the result. as.matrix() always returns
# the full symmetric matrix, so melt() yields every pair in both orders plus
# the zero diagonal (the diagonal is filtered out in the next step).
bray_curtis_GO5 <- vegdist(megan_GO_5_RA_no_rare@otu_table %>% # Table with relative abundances
                             t() %>% # transposed before computing distances (presumably so that samples are rows - confirm)
                             as.data.frame(),
                           method = "bray", # Bray-Curtis dissimilarity
                           diag = FALSE, # spelled out: `F` is an ordinary variable that can be reassigned
                           upper = TRUE) %>%
  as.matrix() %>% # full square matrix
  reshape2::melt() %>% # long format: one row per (Var1, Var2) pair
  left_join(data.frame(sample_data(megan_go_clr_5)) %>%
              rownames_to_column("Var1"),
            by = "Var1") %>% # explicit key: reef name of the first sample of each pair
  dplyr::select(c("Var1", "Var2", "value", "REEF_NAME")) %>%
  dplyr::rename(REEF_NAME_for_Var1 = REEF_NAME) %>%
  left_join(data.frame(sample_data(megan_go_clr_5)) %>%
              rownames_to_column("Var2"),
            by = "Var2") %>% # explicit key: reef name of the second sample of each pair
  dplyr::select(c("Var1", "Var2", "value", "REEF_NAME_for_Var1", "REEF_NAME")) %>%
  dplyr::rename(REEF_NAME_for_Var2 = REEF_NAME)
## Joining with `by = join_by(Var1)`
## Joining with `by = join_by(Var2)`
# Within-site replicate similarity for GO terms (Rank 5):
#  1. keep only pairs whose two samples share the same reef name,
#  2. drop zero dissimilarities (a replicate compared with itself),
#  3. convert Bray-Curtis dissimilarity to similarity (1 - dissimilarity),
#  4. attach the first two sample_data columns (Reef name and Sampling trip,
#     per the original note) via the shared sample-name column.
bray_curtis_GO5 <- bray_curtis_GO5 %>%
  dplyr::filter(REEF_NAME_for_Var1 == REEF_NAME_for_Var2) %>%
  dplyr::filter(value != 0) %>%
  mutate(Bray_Curtis_similarity = 1 - value) %>%
  left_join(
    rownames_to_column(
      data.frame(sample_data(megan_go_clr_5))[, c(1, 2)],
      "Var1"
    )
  )
## Joining with `by = join_by(Var1)`
# Summary statistics of the within-site similarities (rounded to 2 decimals);
# these are printed as the text label on the GO rank 5 boxplot.
bray_curtis_GO5_median <- round(median(bray_curtis_GO5[["Bray_Curtis_similarity"]]), 2)
bray_curtis_GO5_mean <- round(mean(bray_curtis_GO5[["Bray_Curtis_similarity"]]), 2)
bray_curtis_GO5_SD <- round(sd(bray_curtis_GO5[["Bray_Curtis_similarity"]]), 2)
bray_curtis_GO5_minimum <- round(min(bray_curtis_GO5[["Bray_Curtis_similarity"]]), 2)

# Plotting as boxplots:
# A single box for all within-site replicate pairs (GO terms, Rank 5).
# Jittered points are coloured by sampling trip, the text label reports the
# pre-computed median / mean / SD / minimum, and the green dot marks the mean.
bray_curtis_GO5_boxplots <- bray_curtis_GO5 %>%
  ggplot(aes(x = "Within replicate for GO terms (Rank 5)",
             y = Bray_Curtis_similarity)) +
  geom_boxplot(show.legend = FALSE, outlier.shape = NA) + # outliers hidden: every point is already drawn by geom_jitter()
  geom_jitter(aes(color = Sampling_trip), size = 0.4, alpha = 0.2) +
  stat_summary(aes(label = paste("Median:", bray_curtis_GO5_median,
                                 "\nMean:", bray_curtis_GO5_mean,
                                 "\nSD:", bray_curtis_GO5_SD,
                                 "\nMin:", bray_curtis_GO5_minimum)),
               fun = median, # `fun.y` is deprecated (ggplot2 >= 3.3.0); `fun` matches the mean layer below
               geom = "text",
               color = "black") + # summary statistics as text, anchored at the median
  stat_summary(fun = mean,
               geom = "point",
               shape = 20,
               size = 1.5,
               color = "seagreen1",
               fill = "seagreen1") + # Plotting the mean as a green dot!
  scale_color_manual(values = c("indianred", # Sampling trip 1
                                "indianred4", # Sampling trip 2
                                "red3", # Sampling trip 3
                                "slateblue") # Sampling trip 4
  ) +
  ylim(0, 1) + # Fixing the y axis to the full range of Bray-Curtis similarity
  ylab("Bray-Curtis similarity") +
  theme(legend.position = "none")
# Combined figure of within-site replicate similarity: taxonomy panels first,
# then microbial function panels, all in one row.
# Eight panels are supplied, so the grid needs eight cells. With
# `ncol = 6, nrow = 1` only six cells exist and the last two panels
# (GO rank 4 and GO rank 3) are not drawn.
plot_grid(bray_curtis_genus_boxplots, # taxonomy first:
          bray_curtis_family_boxplots,
          bray_curtis_order_boxplots,
          bray_curtis_class_boxplots,
          bray_curtis_phylum_boxplots,
          bray_curtis_GO5_boxplots, # Microbial function
          bray_curtis_GO4_boxplots,
          bray_curtis_GO3_boxplots,
          ncol = 8, nrow = 1)

Coupling sPLS and CV (4-fold x 50 repeats) to compute stability scores

The global phyloseq object was also split into four subsets (separate phyloseq objects within each of the four transects), to explore how microbial taxa/genes correlate to WQ metrics within each of the trips, focussing primarily on stability.

Trip 1

To measure how stable the signature is, we use the perf() function to examine how frequently each variable is selected when the data are repeatedly subsampled.

# Repeated M-fold cross-validation (4 folds, 50 repeats) of the Trip 1 taxa sPLS model
stab.trip1.final.spls2.WQ.taxa <- perf(
  trip1.final.spls2.WQ.taxa,
  validation = "Mfold",
  folds = 4,
  nrepeat = 50
)
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
# Repeated M-fold cross-validation (4 folds, 50 repeats) of the Trip 1 GO sPLS model
stab.trip1.final.spls2.WQ.GOs <- perf(
  trip1.final.spls2.WQ.GOs,
  validation = "Mfold",
  folds = 4,
  nrepeat = 50
)

# Stability of the X features on component 1, for each model
stab.Taxa.trip1.comp1 <- stab.trip1.final.spls2.WQ.taxa$features$stability.X$comp1
stab.GOs.trip1.comp1 <- stab.trip1.final.spls2.WQ.GOs$features$stability.X$comp1

# Keep the stability of only those features that the final sPLS models selected
# on component 1 (looked up by feature name)
extr.stab.Taxa.trip1.comp1 <-
  stab.Taxa.trip1.comp1[selectVar(trip1.final.spls2.WQ.taxa, comp = 1)$X$name]
extr.stab.GOs.trip1.comp1 <-
  stab.GOs.trip1.comp1[selectVar(trip1.final.spls2.WQ.GOs, comp = 1)$X$name]

# Plotting stability scores as boxplots, while simultaneously doing the Wilcoxon rank sum test - are these differences statistically significant?

# Preparing the object first: one row per selected feature, with its stability
# score and a label saying whether it came from the taxa or the function model.
Wilcoxon_trip_1_stability <- bind_rows(
  as.data.frame(extr.stab.Taxa.trip1.comp1) %>%
    dplyr::rename(Stability_scores_Trip_1 = extr.stab.Taxa.trip1.comp1) %>%
    mutate(Comparison = "1_Taxa"),
  as.data.frame(extr.stab.GOs.trip1.comp1) %>%
    dplyr::rename(Stability_scores_Trip_1 = extr.stab.GOs.trip1.comp1) %>%
    mutate(Comparison = "2_Functions")
)
# Exporting the scores as a table as well:
write.csv(Wilcoxon_trip_1_stability, "/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Code/output_tables/trip_1_sPLS_stability_scores.csv", row.names = FALSE)

# Stability scores - getting median and SD: https://stackoverflow.com/questions/13372734/how-to-display-the-median-value-in-a-faceted-boxplot-in-ggplot
# na.rm = TRUE: the scores can contain NA (presumably features selected by the
# final model but absent from the CV stability table - confirm). Without it a
# single NA turns the whole group's median/SD into NA and the corresponding
# text label is dropped from the plot.
trip_1_stability_numerical_summ <- ddply(Wilcoxon_trip_1_stability,
                                         .(Comparison),
                                         summarize,
                                         med = median(Stability_scores_Trip_1, na.rm = TRUE),
                                         SD = sd(Stability_scores_Trip_1, na.rm = TRUE))

# Now plotting:
# Boxplots of the component-1 stability scores (taxa vs functions) with the raw
# scores jittered on top, the per-group median and SD printed as text, and the
# pairwise Wilcoxon test drawn as a bracket.
Wilcoxon_trip_1_stability %>%
  ggplot(aes(x = Comparison, y = Stability_scores_Trip_1)) +
  geom_boxplot(fill = "indianred", outlier.shape = NA) +
  geom_jitter(size = 0.8, alpha = 0.5) +
  geom_text(
    data = trip_1_stability_numerical_summ,
    aes(y = med, label = round(med, 2)),
    size = 4.5, vjust = -0.5
  ) + # median as text, at the height of the median
  geom_text(
    data = trip_1_stability_numerical_summ,
    aes(y = SD, label = round(SD, 2)),
    size = 4.5, vjust = -0.5
  ) + # SD as text, at the height of its own value
  ylim(0, 1) +
  labs(y = "Re-occurrence of indicator features at PC 1 (4-fold CV, 50 reps)") +
  theme_bw() +
  stat_pvalue_manual(
    add_xy_position(
      pairwise_wilcox_test(Wilcoxon_trip_1_stability, Stability_scores_Trip_1 ~ Comparison)
    )
  )
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_text()`).
## Removed 1 row containing missing values or values outside the scale range
## (`geom_text()`).

# Then I edited this manually in Inkscape

Trip 2

# Final Trip 2 sPLS models (regression mode, 2 components): water-quality
# metrics (metadata columns 24 to 40) against taxa and against GO terms.
keepX <- c(10, 10) # features kept per component; same value across all trips
trip2.final.spls2.WQ.taxa <- spls(
  X = OTUs_Trip2,
  Y = metadata_Trip2[, 24:40], # Choosing only medians!
  ncomp = 2,
  mode = "regression",
  keepX = keepX
)
trip2.final.spls2.WQ.GOs <- spls(
  X = GOs_Trip2,
  Y = metadata_Trip2[, 24:40], # Choosing only medians!
  ncomp = 2,
  mode = "regression",
  keepX = keepX
)

To measure how stable the signature is, we use the perf() function to examine how frequently each variable is selected when the data are repeatedly subsampled.

# Repeated M-fold cross-validation (4 folds, 50 repeats) of both Trip 2 models.
# The taxa model previously used nrepeat = 20 while the GO model used 50; both
# now use 50 so the two sets of stability scores are estimated from the same
# number of CV runs, as stated in the section title and the plot's axis label.
stab.trip2.final.spls2.WQ.taxa <- perf(trip2.final.spls2.WQ.taxa,
                                       validation = 'Mfold',
                                       folds = 4,
                                       nrepeat = 50)
stab.trip2.final.spls2.WQ.GOs <- perf(trip2.final.spls2.WQ.GOs,
                                       validation = 'Mfold',
                                       folds = 4,
                                       nrepeat = 50)
# Extract stability - Dimension 1
stab.Taxa.trip2.comp1 <- stab.trip2.final.spls2.WQ.taxa$features$stability.X$comp1
stab.GOs.trip2.comp1 <- stab.trip2.final.spls2.WQ.GOs$features$stability.X$comp1
# We extract the stability measures of only the variables selected in spls2
# (looked up by feature name on component 1)
extr.stab.Taxa.trip2.comp1 <-
  stab.Taxa.trip2.comp1[selectVar(trip2.final.spls2.WQ.taxa, comp = 1)$X$name]
extr.stab.GOs.trip2.comp1 <-
  stab.GOs.trip2.comp1[selectVar(trip2.final.spls2.WQ.GOs, comp = 1)$X$name]

# Plotting stability scores as boxplots, while simultaneously doing the Wilcoxon rank sum test - are these differences statistically significant?

# Preparing the object first: one row per selected feature, with its stability
# score and a label saying whether it came from the taxa or the function model.
Wilcoxon_trip_2_stability <- bind_rows(
  as.data.frame(extr.stab.Taxa.trip2.comp1) %>%
    dplyr::rename(Stability_scores_Trip_2 = extr.stab.Taxa.trip2.comp1) %>%
    mutate(Comparison = "1_Taxa"),
  as.data.frame(extr.stab.GOs.trip2.comp1) %>%
    dplyr::rename(Stability_scores_Trip_2 = extr.stab.GOs.trip2.comp1) %>%
    mutate(Comparison = "2_Functions")
)
# Exporting the scores as a table as well:
write.csv(Wilcoxon_trip_2_stability, "/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Code/output_tables/trip_2_sPLS_stability_scores.csv", row.names = FALSE)

# Stability scores - getting median and SD: https://stackoverflow.com/questions/13372734/how-to-display-the-median-value-in-a-faceted-boxplot-in-ggplot
# na.rm = TRUE: if any score is NA, median()/sd() would otherwise return NA for
# the whole group and its text label would be dropped from the plot.
trip_2_stability_numerical_summ <- ddply(Wilcoxon_trip_2_stability,
                                         .(Comparison),
                                         summarize,
                                         med = median(Stability_scores_Trip_2, na.rm = TRUE),
                                         SD = sd(Stability_scores_Trip_2, na.rm = TRUE))

# Now plotting:
# Boxplots of the component-1 stability scores (taxa vs functions) with the raw
# scores jittered on top, the per-group median and SD printed as text, and the
# pairwise Wilcoxon test drawn as a bracket.
Wilcoxon_trip_2_stability %>%
  ggplot(aes(x = Comparison, y = Stability_scores_Trip_2)) +
  geom_boxplot(fill = "indianred4", outlier.shape = NA) +
  geom_jitter(size = 0.8, alpha = 0.5) +
  geom_text(
    data = trip_2_stability_numerical_summ,
    aes(y = med, label = round(med, 2)),
    size = 4.5, vjust = -0.5
  ) + # median as text, at the height of the median
  geom_text(
    data = trip_2_stability_numerical_summ,
    aes(y = SD, label = round(SD, 2)),
    size = 4.5, vjust = -0.5
  ) + # SD as text, at the height of its own value
  ylim(0, 1) +
  labs(y = "Re-occurrence of indicator features at PC 1 (4-fold CV, 50 reps)") +
  theme_bw() +
  stat_pvalue_manual(
    add_xy_position(
      pairwise_wilcox_test(Wilcoxon_trip_2_stability, Stability_scores_Trip_2 ~ Comparison)
    )
  )

# Then I edited this manually in Inkscape

Trip 3

# Final Trip 3 sPLS models (regression mode, 2 components): the imputed Trip 3
# water-quality table against taxa and against GO terms.
keepX <- c(10, 10) # features kept per component; same value across all trips
trip3.final.spls2.WQ.taxa <- spls(
  X = OTUs_Trip3,
  Y = metadata_Trip3.imputed, # Choosing only medians! (presumably subset before imputation - confirm)
  ncomp = 2,
  mode = "regression",
  keepX = keepX
)
trip3.final.spls2.WQ.GOs <- spls(
  X = GOs_Trip3,
  Y = metadata_Trip3.imputed, # Choosing only medians! (presumably subset before imputation - confirm)
  ncomp = 2,
  mode = "regression",
  keepX = keepX
)

To measure how stable the signature is, we use the perf() function to examine how frequently each variable is selected when the data are repeatedly subsampled.

# Repeated M-fold cross-validation (4 folds, 50 repeats) of the Trip 3 taxa model.
# nrepeat was 20 here but 50 for the matching GO model; it is set to 50 so both
# sets of stability scores come from the same number of CV runs, as stated in
# the section title and the plot's axis label.
stab.trip3.final.spls2.WQ.taxa <- perf(trip3.final.spls2.WQ.taxa,
                                       validation = 'Mfold',
                                       folds = 4,
                                       nrepeat = 50)
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
# Repeated M-fold cross-validation (4 folds, 50 repeats) of the Trip 3 GO sPLS model
stab.trip3.final.spls2.WQ.GOs <- perf(
  trip3.final.spls2.WQ.GOs,
  validation = "Mfold",
  folds = 4,
  nrepeat = 50
)

# Stability of the X features on component 1, for each model
stab.Taxa.trip3.comp1 <- stab.trip3.final.spls2.WQ.taxa$features$stability.X$comp1
stab.GOs.trip3.comp1 <- stab.trip3.final.spls2.WQ.GOs$features$stability.X$comp1

# Keep the stability of only those features that the final sPLS models selected
# on component 1 (looked up by feature name)
extr.stab.Taxa.trip3.comp1 <-
  stab.Taxa.trip3.comp1[selectVar(trip3.final.spls2.WQ.taxa, comp = 1)$X$name]
extr.stab.GOs.trip3.comp1 <-
  stab.GOs.trip3.comp1[selectVar(trip3.final.spls2.WQ.GOs, comp = 1)$X$name]

# Plotting stability scores as boxplots, while simultaneously doing the Wilcoxon rank sum test - are these differences statistically significant?

# Preparing the object first: one row per selected feature, with its stability
# score and a label saying whether it came from the taxa or the function model.
Wilcoxon_trip_3_stability <- bind_rows(
  as.data.frame(extr.stab.Taxa.trip3.comp1) %>%
    dplyr::rename(Stability_scores_Trip_3 = extr.stab.Taxa.trip3.comp1) %>%
    mutate(Comparison = "1_Taxa"),
  as.data.frame(extr.stab.GOs.trip3.comp1) %>%
    dplyr::rename(Stability_scores_Trip_3 = extr.stab.GOs.trip3.comp1) %>%
    mutate(Comparison = "2_Functions")
)
# Exporting the scores as a table as well:
write.csv(Wilcoxon_trip_3_stability, "/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Code/output_tables/trip_3_sPLS_stability_scores.csv", row.names = FALSE)

# Stability scores - getting median and SD: https://stackoverflow.com/questions/13372734/how-to-display-the-median-value-in-a-faceted-boxplot-in-ggplot
# na.rm = TRUE: if any score is NA, median()/sd() would otherwise return NA for
# the whole group and its text label would be dropped from the plot.
trip_3_stability_numerical_summ <- ddply(Wilcoxon_trip_3_stability,
                                         .(Comparison),
                                         summarize,
                                         med = median(Stability_scores_Trip_3, na.rm = TRUE),
                                         SD = sd(Stability_scores_Trip_3, na.rm = TRUE))

# Now plotting:
# Boxplots of the component-1 stability scores (taxa vs functions) with the raw
# scores jittered on top, the per-group median and SD printed as text, and the
# pairwise Wilcoxon test drawn as a bracket.
Wilcoxon_trip_3_stability %>%
  ggplot(aes(x = Comparison, y = Stability_scores_Trip_3)) +
  geom_boxplot(fill = "red3", outlier.shape = NA) +
  geom_jitter(size = 0.8, alpha = 0.5) +
  geom_text(
    data = trip_3_stability_numerical_summ,
    aes(y = med, label = round(med, 2)),
    size = 4.5, vjust = -0.5
  ) + # median as text, at the height of the median
  geom_text(
    data = trip_3_stability_numerical_summ,
    aes(y = SD, label = round(SD, 2)),
    size = 4.5, vjust = -0.5
  ) + # SD as text, at the height of its own value
  ylim(0, 1) +
  labs(y = "Re-occurrence of indicator features at PC 1 (4-fold CV, 50 reps)") +
  theme_bw() +
  stat_pvalue_manual(
    add_xy_position(
      pairwise_wilcox_test(Wilcoxon_trip_3_stability, Stability_scores_Trip_3 ~ Comparison)
    )
  )

# Then I edited this manually in Inkscape

Trip 4

# Optimal parameters
# Final Trip 4 sPLS models (regression mode, 2 components): water-quality
# metrics (metadata columns 24 to 40) against taxa and against GO terms.
# keepX was c(50, 50) here, contradicting the "same value across all trips"
# note; it is aligned with the c(10, 10) used for Trips 2 and 3.
# NOTE(review): confirm that Trip 1 also uses c(10, 10).
keepX <- c(10, 10) # Keeping the same value across all trips!
trip4.final.spls2.WQ.taxa <- spls(X = OTUs_Trip4,
                            Y = metadata_Trip4[,24:40], # Choosing only medians!
                            ncomp = 2,
                            keepX = keepX,
                            mode = "regression")
trip4.final.spls2.WQ.GOs <- spls(X = GOs_Trip4,
                            Y = metadata_Trip4[,24:40], # Choosing only medians!
                            ncomp = 2,
                            keepX = keepX,
                            mode = "regression")

To measure how stable the signature is, we use the perf() function to examine how frequently each variable is selected when the data are repeatedly subsampled.

# Repeated M-fold cross-validation (4 folds, 50 repeats) of the Trip 4 taxa model.
# nrepeat was 20 here but 50 for the matching GO model; it is set to 50 so both
# sets of stability scores come from the same number of CV runs, as stated in
# the section title and the plot's axis label.
stab.trip4.final.spls2.WQ.taxa <- perf(trip4.final.spls2.WQ.taxa,
                                       validation = 'Mfold',
                                       folds = 4,
                                       nrepeat = 50)
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
## Warning: The SGCCA algorithm did not converge
# Repeated M-fold cross-validation (4 folds, 50 repeats) of the Trip 4 GO sPLS model
stab.trip4.final.spls2.WQ.GOs <- perf(
  trip4.final.spls2.WQ.GOs,
  validation = "Mfold",
  folds = 4,
  nrepeat = 50
)

# Stability of the X features on component 1, for each model
stab.Taxa.trip4.comp1 <- stab.trip4.final.spls2.WQ.taxa$features$stability.X$comp1
stab.GOs.trip4.comp1 <- stab.trip4.final.spls2.WQ.GOs$features$stability.X$comp1

# Keep the stability of only those features that the final sPLS models selected
# on component 1 (looked up by feature name)
extr.stab.Taxa.trip4.comp1 <-
  stab.Taxa.trip4.comp1[selectVar(trip4.final.spls2.WQ.taxa, comp = 1)$X$name]
extr.stab.GOs.trip4.comp1 <-
  stab.GOs.trip4.comp1[selectVar(trip4.final.spls2.WQ.GOs, comp = 1)$X$name]

# Plotting stability scores as boxplots, while simultaneously doing the Wilcoxon rank sum test - are these differences statistically significant?

# Preparing the object first: one row per selected feature, with its stability
# score and a label saying whether it came from the taxa or the function model.
Wilcoxon_trip_4_stability <- bind_rows(
  as.data.frame(extr.stab.Taxa.trip4.comp1) %>%
    dplyr::rename(Stability_scores_Trip_4 = extr.stab.Taxa.trip4.comp1) %>%
    mutate(Comparison = "1_Taxa"),
  as.data.frame(extr.stab.GOs.trip4.comp1) %>%
    dplyr::rename(Stability_scores_Trip_4 = extr.stab.GOs.trip4.comp1) %>%
    mutate(Comparison = "2_Functions")
)
# Exporting the scores as a table as well:
write.csv(Wilcoxon_trip_4_stability, "/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Code/output_tables/trip_4_sPLS_stability_scores.csv", row.names = FALSE)

# Stability scores - getting median and SD: https://stackoverflow.com/questions/13372734/how-to-display-the-median-value-in-a-faceted-boxplot-in-ggplot
# na.rm = TRUE: the scores can contain NA (presumably features selected by the
# final model but absent from the CV stability table - confirm). Without it a
# single NA turns the whole group's median/SD into NA and the corresponding
# text label is dropped from the plot.
trip_4_stability_numerical_summ <- ddply(Wilcoxon_trip_4_stability,
                                         .(Comparison),
                                         summarize,
                                         med = median(Stability_scores_Trip_4, na.rm = TRUE),
                                         SD = sd(Stability_scores_Trip_4, na.rm = TRUE))

# Now plotting:
# Boxplots of the component-1 stability scores (taxa vs functions) with the raw
# scores jittered on top, the per-group median and SD printed as text, and the
# pairwise Wilcoxon test drawn as a bracket.
Wilcoxon_trip_4_stability %>%
  ggplot(aes(x = Comparison, y = Stability_scores_Trip_4)) +
  geom_boxplot(fill = "slateblue", outlier.shape = NA) +
  geom_jitter(size = 0.8, alpha = 0.5) +
  geom_text(
    data = trip_4_stability_numerical_summ,
    aes(y = med, label = round(med, 2)),
    size = 4.5, vjust = -0.5
  ) + # median as text, at the height of the median
  geom_text(
    data = trip_4_stability_numerical_summ,
    aes(y = SD, label = round(SD, 2)),
    size = 4.5, vjust = -0.5
  ) + # SD as text, at the height of its own value
  ylim(0, 1) +
  labs(y = "Re-occurrence of indicator features at PC 1 (4-fold CV, 50 reps)") +
  theme_bw() +
  stat_pvalue_manual(
    add_xy_position(
      pairwise_wilcox_test(Wilcoxon_trip_4_stability, Stability_scores_Trip_4 ~ Comparison)
    )
  )
## Warning: Removed 59 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bracket()`).
## Warning: Removed 59 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_text()`).
## Removed 1 row containing missing values or values outside the scale range
## (`geom_text()`).

# Then I edited this manually in Inkscape